From 85c0bc840ce4b7507caa1f99a6ee8e498dc947c0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 20:40:27 +0000 Subject: [PATCH 001/264] Use NNVM interface for upsampling. --- src/operator/nn/upsampling-inl.h | 224 +++++++++---------------------- src/operator/nn/upsampling.cc | 162 +++++++++++++++++----- src/operator/nn/upsampling.cu | 38 +----- 3 files changed, 199 insertions(+), 225 deletions(-) diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index f660609ace28..91254dad9046 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -35,6 +35,7 @@ #include #include #include "../operator_common.h" +#include "./deconvolution-inl.h" namespace mxnet { namespace op { @@ -82,17 +83,16 @@ struct UpSamplingParam : public dmlc::Parameter { }; // struct UpSamplingParam template -class UpSamplingNearestOp : public Operator { +class UpSamplingNearestOp { public: - explicit UpSamplingNearestOp(UpSamplingParam p) { + void Init(UpSamplingParam p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), static_cast(param_.num_args)); @@ -125,19 +125,14 @@ class UpSamplingNearestOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, + void Backward(const OpContext &ctx, const TBlob &out_grad, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); CHECK_EQ(in_grad.size(), static_cast(param_.num_args)); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[up_enum::kOut].get(s); + Tensor grad = out_grad.get(s); if (param_.num_args > 1) { int begin = 0; for (int i = 0; i < param_.num_args; ++i) { @@ -181,154 +176,67 @@ class UpSamplingNearestOp : public Operator { UpSamplingParam param_; }; // class UpSamplingNearestOp -template -Operator *CreateOp(UpSamplingParam param, int dtype); - - -#if DMLC_USE_CXX11 -class UpSamplingProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - std::vector ListArguments() const override { - if (param_.sample_type == up_enum::kNearest) { - std::vector ret; - for (int i = 0; i < param_.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; - } else { - return {"data", "weight"}; - } - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - CHECK_GE(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - TShape oshape = dshape; - if (param_.sample_type == up_enum::kNearest) { - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - oshape[1] = 0; - for (auto& shape : *in_shape) { - CHECK_EQ(shape.ndim(), 4U) << \ - "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; - int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; - CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ - "does not divide output height of " << oh; - CHECK_EQ(ow%shape[3], 0U) << 
"UpSamplingNearest: input width of " << shape[3] << \ - "does not divide output width of " << ow; - if (param_.multi_input_mode == up_enum::kSum) { - CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ - "Number of channels must be the same when multi_input_mode==sum"; - oshape[1] = shape[1]; - } else { - oshape[1] += shape[1]; - } - } - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - CHECK_EQ(dshape.ndim(), 4U) << \ - "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; - if (dshape.ndim() == 0) return false; - int kernel = 2 * param_.scale - param_.scale % 2; - SHAPE_ASSIGN_CHECK(*in_shape, - up_enum::kWeight, - mshadow::Shape4(dshape[1], 1, kernel, kernel)); - oshape = dshape; - } - oshape[2] = dshape[2] * param_.scale; - oshape[3] = dshape[3] * param_.scale; - out_shape->clear(); - out_shape->push_back(oshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } +static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) { + DeconvolutionParam p = DeconvolutionParam(); + int kernel = 2 * param.scale - param.scale % 2; + int stride = param.scale; + int pad = static_cast(ceil((param.scale - 1) / 2.)); + p.workspace = param.workspace; + p.num_group = param.num_filter; + p.num_filter = param.num_filter; + p.no_bias = true; + int shape[] = {1, 1}; + p.dilate = TShape(shape, shape + 2); + shape[0] = shape[1] = kernel; + p.kernel = TShape(shape, shape + 2); + shape[0] = shape[1] = stride; + p.stride = TShape(shape, shape + 2); + shape[0] = shape[1] = pad; + p.pad = TShape(shape, shape + 2); + return p; +} - OperatorProperty* Copy() const override { - auto ptr = new UpSamplingProp(); - ptr->param_ = this->param_; - return ptr; - } - - std::string TypeString() const override { - return "UpSampling"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.sample_type == up_enum::kNearest) { - return {out_grad[up_enum::kOut]}; - } else { - return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]}; - } - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - if (param_.sample_type == up_enum::kNearest) { - return {}; - } else { - return {ResourceRequest::kTempSpace}; - } - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - if (param_.sample_type == up_enum::kNearest) { - return {}; - } else { - return {ResourceRequest::kTempSpace}; - } - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void UpSamplingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const 
std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+      static thread_local UpSamplingNearestOp<xpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    });
+  } else if (param.sample_type == up_enum::kBilinear) {
+    DeconvolutionParam p = GetDeconvolutionParam(param);
+    _DeconvolutionCompute<xpu>(p, ctx, inputs, req, outputs);
+  } else {
+    LOG(FATAL) << "Unknown sample type";
+  }
+}
+
+template<typename xpu>
+void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+      CHECK_EQ(inputs.size(), 1U);
+      static thread_local UpSamplingNearestOp<xpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, inputs[0], req, outputs);
+    });
+  } else if (param.sample_type == up_enum::kBilinear) {
+    DeconvolutionParam p = GetDeconvolutionParam(param);
+    _DeconvolutionGradCompute<xpu>(p, ctx, inputs, req, outputs);
+  } else {
+    LOG(FATAL) << "Unknown sample type";
+  }
+}
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc
index 8942e35ab325..87316a939718 100644
--- a/src/operator/nn/upsampling.cc
+++ b/src/operator/nn/upsampling.cc
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./upsampling-inl.h"
@@ -30,51 +30,123 @@
 namespace mxnet {
 namespace op {
 
-template<>
-Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<cpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias = true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<cpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
+
+static bool UpSamplingShape(const nnvm::NodeAttrs& attrs,
+                            std::vector<TShape> *in_shape, std::vector<TShape> *out_shape) {
+  const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_GE(in_shape->size(), 1U);
+  const TShape &dshape = (*in_shape)[0];
+  TShape oshape = dshape;
+  if (param_.sample_type == up_enum::kNearest) {
+    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+    oshape[1] = 0;
+    for (auto& shape : *in_shape) {
+      CHECK_EQ(shape.ndim(), 4U) << \
+        "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
+      int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
+      CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
+        "does not divide output height of " << oh;
+      CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
+        "does not divide output width of " << ow;
+      if (param_.multi_input_mode == up_enum::kSum) {
+        CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
+          "Number of channels must be the same when multi_input_mode==sum";
+        oshape[1] = shape[1];
+      } else {
+        oshape[1] += shape[1];
+      }
+    }
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+    CHECK_EQ(dshape.ndim(), 4U) << \
+      "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
+    if (dshape.ndim() == 0) return false;
+    int kernel = 2 * param_.scale - param_.scale % 2;
+    SHAPE_ASSIGN_CHECK(*in_shape,
+                       up_enum::kWeight,
+                       mshadow::Shape4(dshape[1], 1, kernel, kernel));
+    oshape = dshape;
+  }
+  oshape[2] = dshape[2] * param_.scale;
+  oshape[3] = dshape[3] * param_.scale;
+  out_shape->clear();
+  out_shape->push_back(oshape);
+  return true;
+}
+
+static inline std::vector<std::string> ListArguments(const UpSamplingParam& param) {
+  if (param.sample_type == up_enum::kNearest) {
+    std::vector<std::string> ret;
+    for (int i = 0; i < param.num_args; ++i) {
+      ret.push_back(std::string("arg") + std::to_string(i));
     }
-  });
-  return op;
+    return ret;
+  } else {
+    return {"data", "weight"};
+  }
 }
 
-Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                           std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+static bool UpSamplingType(const nnvm::NodeAttrs& attrs,
+                           std::vector<int> *in_type, std::vector<int> *out_type) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
 }
 
+struct UpSamplingGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(n->attrs.parsed);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    if (param_.sample_type != up_enum::kNearest) {
+      heads.push_back(n->inputs[up_enum::kData]);
+      heads.push_back(n->inputs[up_enum::kWeight]);
+    }
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
 DMLC_REGISTER_PARAMETER(UpSamplingParam);
 
-MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp)
+NNVM_REGISTER_OP(UpSampling)
 .describe("Performs nearest neighbor/bilinear up sampling to inputs.")
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
+  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<UpSamplingParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return ListArguments(nnvm::get<UpSamplingParam>(attrs.parsed));
+})
+.set_attr<nnvm::FInferShape>("FInferShape", UpSamplingShape)
+.set_attr<nnvm::FInferType>("FInferType", UpSamplingType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  }
})
+.set_attr<FCompute>("FCompute", UpSamplingCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", UpSamplingGrad{"_backward_UpSampling"})
+.set_attr<std::string>("key_var_num_args", "num_args")
 .add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample")
 .add_arguments(UpSamplingParam::__FIELDS__())
-.set_key_var_num_args("num_args");
-
-NNVM_REGISTER_OP(UpSampling)
 .set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
     [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
       if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
       if (index == 1) {
         var->attrs.dict["__init__"] = "[\"bilinear\", {}]";
       }
     });
+
+NNVM_REGISTER_OP(_backward_UpSampling)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
+  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  }
+})
+.set_attr_parser(ParamParser<UpSamplingParam>)
+.set_attr<FCompute>("FCompute", UpSamplingGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/upsampling.cu b/src/operator/nn/upsampling.cu
index f83535a2b2e6..c5ff2fafd64a 100644
--- a/src/operator/nn/upsampling.cu
+++ b/src/operator/nn/upsampling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
@@ -29,36 +29,12 @@
 namespace mxnet {
 namespace op {
 
-template<>
-Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<gpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias = true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<gpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
-    }
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(UpSampling)
+.set_attr<FCompute>("FCompute", UpSamplingCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_UpSampling)
+.set_attr<FCompute>("FCompute", UpSamplingGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet

From e1fe097d1a9fa1a958c3c72a8aa4952521a52841 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Mon, 27 Nov 2017 20:42:40 +0000
Subject: [PATCH 002/264] Use NNVM interface for convolution.
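
Each patch in this series makes the same change: the stateful
Operator/OperatorProperty pair is replaced by free functions and
attributes registered on the op. As a rough sketch of the pattern,
using a hypothetical operator MyOp (names illustrative, not code
from this series):

    // NNVM interface: a stateless compute callback...
    template<typename xpu>
    void MyOpCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
                     const std::vector<TBlob>& inputs,
                     const std::vector<OpReqType>& req,
                     const std::vector<TBlob>& outputs) {
      // attrs.parsed holds the parameter struct filled in by set_attr_parser.
    }

    // ...registered on the operator, alongside shape/type inference:
    NNVM_REGISTER_OP(MyOp)
    .set_num_inputs(1)
    .set_num_outputs(1)
    .set_attr<FCompute>("FCompute", MyOpCompute<cpu>);

The backward pass becomes a separate _backward_* op generated through
an FGradient functor, as in the UpSampling patch above.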
--- src/operator/nn/convolution-inl.h | 340 ++++------------------------ src/operator/nn/convolution.cc | 355 +++++++++++++++++++++++++----- src/operator/nn/convolution.cu | 149 +++++++++++-- 3 files changed, 471 insertions(+), 373 deletions(-) diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 38971aefa2d3..4af16f0aa231 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -22,7 +22,7 @@ * \file convolution-inl.h * \brief * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ @@ -148,9 +148,9 @@ namespace mxnet { namespace op { template -class ConvolutionOp : public Operator { +class ConvolutionOp { public: - explicit ConvolutionOp(ConvolutionParam p) { + void Init(ConvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. param_.workspace = (param_.workspace << 20) / sizeof(DType); @@ -160,11 +160,10 @@ class ConvolutionOp : public Operator { << "Only support NCW, NCHW and NCDHW layout"; } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req[conv::kOut], kWriteTo); @@ -233,18 +232,19 @@ class ConvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector& out_grad, const std::vector& in_data, - const std::vector& out_data, const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { + const std::vector& in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); + // We expect 2 inputs: in data and weight. We don't need bias for + // computing gradient. size_t expected = param_.no_bias == 0 ? 3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_); @@ -386,299 +386,35 @@ class ConvolutionOp : public Operator { }; // class ConvolutionOp template -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class ConvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; - param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - } - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; - out_shape->resize(1, TShape()); - const TShape &dshp = (*in_shape)[conv::kData]; - if (dshp.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<3> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshp.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, - dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<4> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshp.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. - const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - CHECK_EQ(dshape[1] % param_.num_group, 0U) - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d convolution"; - Shape<5> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; - oshape[4] = dshape[4] ? - (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; - } - if (oshape[4] && param_.stride[2] == 1) { - dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCDHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - if (dshape[4] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; - } - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ConvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Convolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +void ConvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { + static thread_local ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - private: - // Adds symmetric padding to a data input (in one dimension) - index_t AddPad(index_t dsize, index_t pad) const { - return dsize + 2 * pad; - } +template +void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + static thread_local ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, 
in_grad); + }); +} - ConvolutionParam param_; -}; // class ConvolutionProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ef8ec9034db2..bca8adcba2a0 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -21,10 +21,11 @@ * Copyright (c) 2017 by Contributors * \file convolution.cc * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #include "./convolution-inl.h" +#include "../elemwise_op_common.h" #if MXNET_USE_MKL2017 == 1 #include #include "../mkl/mkl_memory-inl.h" @@ -38,63 +39,277 @@ namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(ConvolutionParam); -template<> -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - // If 1D convolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); - }) - return op; +static inline index_t AddPad(index_t dsize, index_t pad) { + return dsize + 2 * pad; +} + +static inline std::vector ListArguments(const ConvolutionParam& param_) { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; } -#if MXNET_USE_MKL2017 == 1 - if ((param.dilate[0] == 1 && param.dilate[1] == 1) - && param.kernel.ndim() == 2) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLConvolutionOp(param); - case mshadow::kFloat64: - return new MKLConvolutionOp(param); - default: - break; +} + +static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + using namespace mshadow; + const ConvolutionParam& param_ = nnvm::get(attrs.parsed); + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + if (dshp.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<3> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? 
+ (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, + dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
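+  //    For example (illustrative values): with kernel = (3,3), dilate = (1,1),
+  //    stride = (1,1) and pad = (1,1), dilated_ksize = 3, so the back-calculation
+  //    gives in = out + 3 - 1 - 2*1 = out, recovering the input of a 'same' convolution.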
+ oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshp.ndim(), 5U) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); } + + // Note: 3D dilation currently not supported. + // Calculations below done to preserve symmetry with 1D/2D code. + const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + CHECK_EQ(dshape[1] % param_.num_group, 0U) + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d convolution"; + Shape<5> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; + oshape[4] = dshape[4] ? + (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
+ oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; + } + if (oshape[4] && param_.stride[2] == 1) { + dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCDHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + if (dshape[4] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; + } + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; } -#endif -#if MXNET_USE_NNPACK == 1 - const size_t batch_size = (*in_shape)[0][0]; - if ((param.dilate[0] == 1 && param.dilate[1] == 1) - && param.kernel.ndim() == 2 && (!param.no_bias) - && param.num_group == 1 && (batch_size == 1 || - ((batch_size > 1) && (param.stride[0] == 1) && - (param.stride[1] == 1)))) { - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKConvolutionOp(param); - default: - break; +} + +static bool ConvolutionType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const ConvolutionParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); } } -#endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); - }) - return op; + out_type->clear(); + out_type->push_back(dtype); + return true; } -// DO_BIND_DISPATCH comes from operator_common.h -Operator *ConvolutionProp::CreateOperatorEx(Context ctx, - std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + ConvolutionParam param_; + try { + param_.Init(attrs->dict); + } catch (const dmlc::ParamError& e) { + std::ostringstream os; + os << e.what(); + os << ", in operator " << attrs->op->name << "(" + << "name=\"" << attrs->name << "\""; + for (const auto& k : attrs->dict) { + os << ", " << k.first << "=\"" << k.second << "\""; + } + os << ")"; + throw dmlc::ParamError(os.str()); + } + + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; + param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + attrs->parsed = std::move(param_); } -MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp) +struct ConvolutionGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + const ConvolutionParam& param = nnvm::get(n->attrs.parsed); + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[conv::kData]); + heads.push_back(n->inputs[conv::kWeight]); + if (!param.no_bias) + heads.push_back(n->inputs[conv::kBias]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +NNVM_REGISTER_OP(Convolution) .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, @@ -168,10 +383,52 @@ There are other options to tune the performance. the performance. )code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(ConvolutionParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + if (params.no_bias) + return std::vector{"data", "weight"}; + else + return std::vector{"data", "weight", "bias"}; +}) +.set_attr("FInferShape", ConvolutionShape) +.set_attr("FInferType", ConvolutionType) +.set_attr("FInferStorageType", [](const nnvm::NodeAttrs& attrs, + const Context& ctx, std::vector *in_attrs, std::vector *out_attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + if (params.no_bias) + return ElemwiseStorageType<2, 1>(attrs, ctx, in_attrs, out_attrs); + else + return ElemwiseStorageType<3, 1>(attrs, ctx, in_attrs, out_attrs); +}) +.set_attr("FCompute", ConvolutionCompute) +.set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) .add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") .add_arguments(ConvolutionParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Convolution) +.set_num_inputs(3) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(ConvolutionParamParser)
+.set_attr<FCompute>("FCompute", ConvolutionGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index c31d78c226f4..50b4b04ff354 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -21,43 +21,133 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cu
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #include "./convolution-inl.h"
 #include <vector>
+#include "./depthwise_convolution-inl.h"
 #if MXNET_USE_CUDNN == 1
 #include "./cudnn/cudnn_convolution-inl.h"
 #endif  // MXNET_USE_CUDNN
 
-#include "./depthwise_convolution-inl.h"
-
 namespace mxnet {
 namespace op {
 
+// This is to maintain one copy for each type.
+template<typename DType>
+static ConvolutionOp<gpu, DType> &get_op(const ConvolutionParam& param) {
+  static thread_local ConvolutionOp<gpu, DType> op;
+  op.Init(param);
+  return op;
+}
+
+template<typename DType>
+static CuDNNConvolutionOp<DType> &get_cudnn_op(const ConvolutionParam& param,
+    int forward_compute_type, int backward_compute_type,
+    const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
+    const Context& ctx) {
+  static thread_local CuDNNConvolutionOp<DType> op;
+  op.Init(param, forward_compute_type, backward_compute_type,
+      in_shape, out_shape, ctx);
+  return op;
+}
+
 template<>
-Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
+void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  int dtype = inputs[conv::kData].type_flag_;
+
+  // If 1D convolution, use MXNet implementation
   if (param.kernel.ndim() == 1) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new ConvolutionOp<gpu, DType>(param);
+      ConvolutionOp<gpu, DType> &op = get_op<DType>(param);
+      op.Forward(ctx, inputs, req, outputs);
     })
-    return op;
+    return;
+  } else if (param.num_filter == param.num_group &&
+             param.layout.value() == mshadow::kNCHW &&
+             param.num_filter == inputs[conv::kData].shape_[1] &&
+             param.kernel.ndim() == 2 &&
+             param.dilate == mshadow::Shape2(1, 1) &&
+             dtype == mshadow::kFloat32) {
+    static thread_local DepthwiseConvolutionOp<float> op;
+    std::vector<TShape> in_shape(inputs.size());
+    std::vector<TShape> out_shape(1, outputs[0].shape_);
+    for (size_t i = 0; i < in_shape.size(); i++)
+      in_shape[i] = inputs[i].shape_;
+    op.Init(param, in_shape, out_shape);
+    op.Forward(ctx, inputs, req, outputs);
+    return;
   }
-  // depth wise conv
-  if (param.num_filter == param.num_group &&
-      param.layout.value() == mshadow::kNCHW &&
-      param.num_filter == (*in_shape)[conv::kData][1] &&
-      param.kernel.ndim() == 2 &&
-      param.dilate == mshadow::Shape2(1, 1) &&
-      dtype == mshadow::kFloat32) {
-    op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape);
-    return op;
-  }
 
 #if MXNET_USE_CUDNN == 1
   // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.cudnn_off) {
+      ConvolutionOp<gpu, DType> &op = get_op<DType>(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
+      ConvolutionOp<gpu, DType> &op = get_op<DType>(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else {
+      std::vector<TShape> in_shape(inputs.size());
+      std::vector<TShape> out_shape(1, outputs[0].shape_);
+      for (size_t i = 0; i < in_shape.size(); i++)
+        in_shape[i] = inputs[i].shape_;
+      CuDNNConvolutionOp<DType> &op = get_cudnn_op<DType>(param,
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+      op.Forward(ctx, inputs, req, outputs);
+    }
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    ConvolutionOp<gpu, DType> &op = get_op<DType>(param);
+    op.Forward(ctx, inputs, req, outputs);
+  })
+#endif  // MXNET_USE_CUDNN
+}
+
+template<>
+void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+  int dtype = out_grad.type_flag_;
+
+  // If 1D convolution, use MXNet implementation
+  if (param.kernel.ndim() == 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      ConvolutionOp<gpu, DType> &op = get_op<DType>(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    })
+    return;
+  } else if (param.num_filter == param.num_group &&
+             param.layout.value() == mshadow::kNCHW &&
+             param.num_filter == in_data[conv::kData].shape_[1] &&
+             param.kernel.ndim() == 2 &&
+             param.dilate == mshadow::Shape2(1, 1) &&
+             dtype == mshadow::kFloat32) {
+    static thread_local DepthwiseConvolutionOp<float> op;
+    // The first element stores out grad.
+ std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = in_data[i].shape_; + CuDNNConvolutionOp &op = get_cudnn_op(param, + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); + ConvolutionOp &op = get_op(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN - return op; } +NNVM_REGISTER_OP(Convolution) +.set_attr("FCompute", ConvolutionCompute); + +NNVM_REGISTER_OP(_backward_Convolution) +.set_attr("FCompute", ConvolutionGradCompute); + } // namespace op } // namespace mxnet From 1bea1f39def4ba4e791b6c9341f4de1ecd06f264 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 20:45:03 +0000 Subject: [PATCH 003/264] Use NNVM interface for deconvolution. --- src/operator/nn/deconvolution-inl.h | 350 ++++------------------------ src/operator/nn/deconvolution.cc | 319 +++++++++++++++++++++++-- src/operator/nn/deconvolution.cu | 117 ++++++++-- 3 files changed, 447 insertions(+), 339 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index b7d2676fadf3..42ab9cb1aba9 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file deconvolution-inl.h * \brief - * \author Wei Wu + * \author Wei Wu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ @@ -195,19 +195,18 @@ namespace mxnet { namespace op { template -class DeconvolutionOp : public Operator { +class DeconvolutionOp { public: - explicit DeconvolutionOp(DeconvolutionParam p) { + void Init(DeconvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. param_.workspace = (param_.workspace << 20) / sizeof(real_t); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; @@ -311,19 +310,18 @@ class DeconvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); // get data @@ -456,300 +454,52 @@ class DeconvolutionOp : public Operator { }; // class DeconvolutionOp template -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class DeconvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - if (param_.adj.ndim() == 0) param_.adj = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; - param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); - } - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { -#if MXNET_USE_CUDNN == 0 - if (param_.kernel.ndim() != 2) { - LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported"; - return false; - } -#endif // CUDNN - - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - out_shape->resize(1, TShape()); - const TShape &dshape = (*in_shape)[deconv::kData]; - if (dshape.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - - index_t o_pad[1]; - index_t o_adj[1]; - param_.InferPad(dshape_ncw, o_pad, o_adj); - - CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - 
CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]"; - - Shape<3> oshape; - oshape[0] = dshape_ncw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + - dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshape.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(dshape_nchw[1], - param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - - index_t o_pad[2]; - index_t o_adj[2]; - param_.InferPad(dshape_nchw, o_pad, o_adj); - - CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]"; - - Shape<4> oshape; - oshape[0] = dshape_nchw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + - dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + - dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshape.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, 
Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. - const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - - index_t o_pad[3]; - index_t o_adj[3]; - param_.InferPad(dshape_ncdhw, o_pad, o_adj); - - CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d deconvolution"; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]"; - CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]"; - - Shape<5> oshape; - oshape[0] = dshape_ncdhw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + - dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + - dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; - oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + - dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - if (param_.target_shape[2] > 0) { - CHECK_EQ(param_.target_shape[2], oshape[4]) \ - << "param_.target_shape[2] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new DeconvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Deconvolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]}; - } +void _DeconvolutionCompute(const DeconvolutionParam& param, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + static thread_local DeconvolutionOp op; + op.Init(param); + 
op.Forward(ctx, inputs, req, outputs); + }); +} - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } +template +void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + _DeconvolutionCompute(param, ctx, inputs, req, outputs); +} - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } +template +void _DeconvolutionGradCompute(const DeconvolutionParam& param, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + static thread_local DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }); +} - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + _DeconvolutionGradCompute(param, ctx, inputs, req, outputs); +} - private: - DeconvolutionParam param_; -}; // class DeconvolutionProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 45867f78593c..eb958154baa7 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -21,45 +21,318 @@ * Copyright (c) 2015 by Contributors * \file deconvolution.cc * \brief - * \author Wei Wu + * \author Wei Wu, Da Zheng */ #include "./deconvolution-inl.h" namespace mxnet { namespace op { -template<> -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); - }); - return op; + +static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); +#if MXNET_USE_CUDNN == 0 + if (param_.kernel.ndim() != 2) { + LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported"; + return false; + } +#endif // CUDNN + + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshape = (*in_shape)[deconv::kData]; + if (dshape.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + 
SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+
+    index_t o_pad[1];
+    index_t o_adj[1];
+    param_.InferPad(dshape_ncw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be smaller than stride[0]";
+
+    Shape<3> oshape;
+    oshape[0] = dshape_ncw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
+        dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
+
+    if (param_.target_shape.ndim() > 0) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+
+    return true;
+  } else if (param_.kernel.ndim() == 2) {
+    // 2d conv
+    CHECK_EQ(dshape.ndim(), 4U) \
+      << "Input data should be 4D in batch-num_filter-y-x";
+    Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
+    Shape<4> wshape = Shape4(dshape_nchw[1],
+                             param_.num_filter / param_.num_group,
+                             param_.kernel[0], param_.kernel[1]);
+    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+
+    index_t o_pad[2];
+    index_t o_adj[2];
+    param_.InferPad(dshape_nchw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be smaller than stride[0]";
+    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be smaller than stride[1]";
+
+    Shape<4> oshape;
+    oshape[0] = dshape_nchw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
+        dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
+    oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
+        dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
+
+    if (param_.target_shape.ndim() > 1) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
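+    // For reference: each spatial output dimension computed above follows
+    // the transposed-convolution relation
+    //   out = stride * (in - 1) + dilated_kernel - 2 * pad + adj.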
+
+    return true;
+  } else if (param_.kernel.ndim() == 3) {
+    // 3d conv
+    CHECK_EQ(dshape.ndim(), 5U) \
+      << "Input data should be 5D in batch-num_filter-depth-y-x";
+    Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
+    Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
+                             param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    // Note: 3D dilation currently not supported.
+    // Calculations below done to preserve symmetry with 1D/2D code.
+    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+
+    index_t o_pad[3];
+    index_t o_adj[3];
+    param_.InferPad(dshape_ncdhw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    CHECK_EQ(param_.dilate.Size(), 1U)
+      << "Dilate is not supported in 3d deconvolution";
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be smaller than stride[0]";
+    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be smaller than stride[1]";
+    CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be smaller than stride[2]";
+
+    Shape<5> oshape;
+    oshape[0] = dshape_ncdhw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
+        dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
+    oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
+        dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
+    oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
+        dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
+
+    if (param_.target_shape.ndim() > 2) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[2] > 0) {
+        CHECK_EQ(param_.target_shape[2], oshape[4]) \
+          << "param_.target_shape[2] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+
+    return true;
+  } else {
+    LOG(FATAL) << "Unknown deconvolution type";
+    return false;
+  }
+}
+
+static inline std::vector ListArguments(const DeconvolutionParam& param_) {
+  if (!param_.no_bias) {
+    return {"data", "weight", "bias"};
+  } else {
+    return {"data", "weight"};
+  }
+}
+
+static bool DeconvolutionType(const nnvm::NodeAttrs& attrs,
+    std::vector *in_type, std::vector *out_type) {
+  const DeconvolutionParam& param_ = nnvm::get(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+
UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; } -Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx); +static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + DeconvolutionParam param_; + param_.Init(attrs->dict); + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + if (param_.adj.ndim() == 0) param_.adj = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; + param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); + } + attrs->parsed = std::move(param_); } +struct DeconvolutionGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[deconv::kData]); + heads.push_back(n->inputs[deconv::kWeight]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + DMLC_REGISTER_PARAMETER(DeconvolutionParam); -MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp) -.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") -.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") -.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution " - "operation.") -.add_arguments(DeconvolutionParam::__FIELDS__()) +NNVM_REGISTER_OP(Deconvolution) .describe("Computes 2D transposed convolution (aka fractionally strided convolution) of the " "input tensor. This operation can be seen as the gradient of Convolution operation with " "respect to its input. Convolution usually reduces the size of the input. Transposed " "convolution works the other way, going from a smaller input to a larger output while " - "preserving the connectivity pattern."); + "preserving the connectivity pattern.") +.set_num_inputs([](const NodeAttrs& attrs) { + const DeconvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(DeconvolutionParamParser)
+.set_attr("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return ListArguments(nnvm::get(attrs.parsed));
+})
+.set_attr("FInferShape", DeconvolutionShape)
+.set_attr("FInferType", DeconvolutionType)
+.set_attr("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector{ResourceRequest::kTempSpace};
+})
+.set_attr("FCompute", DeconvolutionCompute)
+.set_attr("FGradient", DeconvolutionGrad{"_backward_Deconvolution"})
+.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.")
+.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.")
+.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
+    "operation.")
+.add_arguments(DeconvolutionParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_Deconvolution)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const DeconvolutionParam& params = nnvm::get(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_attr("TIsBackward", true)
+.set_attr("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(DeconvolutionParamParser)
+.set_attr("FCompute", DeconvolutionGradCompute);

 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index 6d0787662c64..0c2e160cf696 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cu
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */

 #include "./deconvolution-inl.h"
@@ -31,19 +31,41 @@
 namespace mxnet {
 namespace op {

+template
+static DeconvolutionOp &get_op(const DeconvolutionParam& param) {
+  static thread_local DeconvolutionOp op;
+  op.Init(param);
+  return op;
+}
+
+template
+static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param,
+    int forward_compute_type, int backward_compute_type,
+    const std::vector& in_shape, const std::vector& out_shape,
+    const Context& ctx, bool backward) {
+  // Deconvolution forward has to be called before backward for this operator,
+  // so we can't make this operator thread-local: backward might be called
+  // from another thread.
+  static CuDNNDeconvolutionOp op;
+  if (!backward)
+    op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx);
+  return op;
+}
+
 template<>
-Operator* CreateOp(DeconvolutionParam param, int dtype,
-                   std::vector *in_shape,
-                   std::vector *out_shape,
-                   Context ctx) {
-  // Logic here parallels that in Convolution.cu
-  Operator *op = NULL;
+void DeconvolutionCompute(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector& inputs,
+    const std::vector& req,
+    const std::vector& outputs) {
+  const DeconvolutionParam& param = nnvm::get(attrs.parsed);
+  int dtype = inputs[0].type_flag_;
   // If 1D deconvolution, use MXNet implementation
   if (param.kernel.ndim() == 1) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new DeconvolutionOp(param);
+      get_op(param).Forward(ctx, inputs, req, outputs);
    })
-    return op;
+    return;
  }
 #if MXNET_USE_CUDNN == 1
  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
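  // (Pseudo-fp16 means the tensors keep fp16 storage while cuDNN
  // accumulates in fp32; compute_type is promoted to kFloat32 for the
  // fp16 case in the code below.)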
@@ -51,23 +73,86 @@ Operator* CreateOp(DeconvolutionParam param, int dtype, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - op = new DeconvolutionOp(param); - } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx)) { + get_op(param).Forward(ctx, inputs, req, outputs); + } else if (!CuDNNDeconvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - op = new DeconvolutionOp(param); + get_op(param).Forward(ctx, inputs, req, outputs); } else { - op = new CuDNNDeconvolutionOp(param, compute_type, compute_type, - *in_shape, *out_shape, ctx); + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) { + in_shape[i] = inputs[i].shape_; + } + get_cudnn_op(param, compute_type, compute_type, + in_shape, out_shape, ctx.run_ctx.ctx, false).Forward(ctx, inputs, req, outputs); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); + get_op(param).Forward(ctx, inputs, req, outputs); + }) +#endif // MXNET_USE_CUDNN +} + +template<> +void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + int dtype = out_grad.type_flag_; + + // If 1D deconvolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + }) + return; + } +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + } else if (!CuDNNDeconvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << + "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); + } else { + std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) { + in_shape[i] = in_data[i].shape_; + } + get_cudnn_op(param, compute_type, compute_type, + in_shape, out_shape, ctx.run_ctx.ctx, true).Backward(ctx, + std::vector{out_grad}, in_data, req, in_grad); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + get_op(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN - return op; } +NNVM_REGISTER_OP(Deconvolution) +.set_attr("FCompute", DeconvolutionCompute); + +NNVM_REGISTER_OP(_backward_Deconvolution) +.set_attr("FCompute", DeconvolutionGradCompute); + } // namespace op } // namespace mxnet From 7e09fc908661bce81c039719ac5087aaccadf98c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:02:57 +0000 Subject: [PATCH 004/264] Use NNVM interface for FullyConnected. 
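
As in the convolution and deconvolution patches, shape and type
inference move out of OperatorProperty into free functions registered
as NNVM attributes, and the stateful operator object is cached per
thread and re-initialized from the parsed parameters on each call. A
minimal sketch of the recurring FCompute pattern (the Example* names
are illustrative placeholders, not symbols from this patch):

    #include <mxnet/op_attr_types.h>
    #include <nnvm/op.h>

    template<typename xpu>
    void ExampleCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
                        const std::vector<TBlob>& inputs,
                        const std::vector<OpReqType>& req,
                        const std::vector<TBlob>& outputs) {
      // ExampleParam/ExampleOp stand in for a real parameter/operator pair.
      const ExampleParam& param = nnvm::get<ExampleParam>(attrs.parsed);
      MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
        // One cached operator per (thread, xpu, DType); Init() only
        // refreshes the parameters.
        static thread_local ExampleOp<xpu, DType> op;
        op.Init(param);
        op.Forward(ctx, inputs, req, outputs);
      });
    }

    NNVM_REGISTER_OP(Example)
    .set_attr<FCompute>("FCompute<cpu>", ExampleCompute<cpu>);

FullyConnected itself dispatches with an explicit switch instead of
MSHADOW_REAL_TYPE_SWITCH because its CPU path only supports fp32 and
fp64, but the caching scheme is the same.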
--- src/operator/nn/fully_connected-inl.h | 202 +++++++++----------------- src/operator/nn/fully_connected.cc | 118 ++++++++++----- src/operator/nn/fully_connected.cu | 52 ++++++- 3 files changed, 197 insertions(+), 175 deletions(-) diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index 9f3deec2449f..07965c354930 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -66,24 +66,18 @@ struct FullyConnectedParam : public dmlc::Parameter { * \tparam xpu The device that the op will be executed on. */ template -class FullyConnectedOp : public Operator { +class FullyConnectedOp { public: - explicit FullyConnectedOp(FullyConnectedParam p) { + void Init(const FullyConnectedParam &p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; if (req[fullc::kOut] == kNullOp) return; CHECK_EQ(req[fullc::kOut], kWriteTo); - size_t expected = param_.no_bias ? 2 : 3; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1U); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context // TODO(bing): judge shape to remove flatten op @@ -118,19 +112,11 @@ class FullyConnectedOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const std::vector &out_grad, + const std::vector &in_data, const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - size_t expected = param_.no_bias ? 2 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context Stream *s = ctx.get_stream(); @@ -177,124 +163,80 @@ class FullyConnectedOp : public Operator { linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); } + static FullyConnectedOp &get_op(const FullyConnectedParam& param) { + static thread_local FullyConnectedOp op; + op.Init(param); + return op; + } + private: FullyConnectedParam param_; }; // class FullyConnectedOp -// Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class FullyConnectedProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); +void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 
2 : 3;
+  CHECK_EQ(inputs.size(), in_expected);
+  CHECK_EQ(outputs.size(), 1U);
+  int dtype = inputs[0].type_flag_;
+
+  switch (dtype) {
+  case mshadow::kFloat32:
+    FullyConnectedOp::get_op(param).Forward(ctx, inputs,
+        req, outputs);
+    break;
+  case mshadow::kFloat64:
+    FullyConnectedOp::get_op(param).Forward(ctx, inputs,
+        req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently "
+                  "only supported by CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
+  }
+}

-  bool InferShape(std::vector *in_shape,
-                  std::vector *out_shape,
-                  std::vector *aux_shape) const override {
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    CHECK_EQ(out_shape->size(), 1U);
-    TShape dshape = (*in_shape)[fullc::kData];
-    TShape oshape = (*out_shape)[0];
-    // require data to be known
-    if (dshape.ndim() == 0) return false;
-
-    index_t num_input;
-    if (!param_.flatten) {
-      num_input = dshape[dshape.ndim()-1];
-    } else {
-      num_input = dshape.ProdShape(1, dshape.ndim());
-    }
-    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input));
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden));
-    }
-
-    if (!param_.flatten) {
-      TShape result_shape(dshape);
-      result_shape[dshape.ndim()-1] = param_.num_hidden;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
-    } else {
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden));
-    }
-    if (oshape.ndim() != 0) {
-      dshape[0] = oshape[0];
-      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
-    }
-    return true;
-  }
-
-  bool InferType(std::vector *in_type,
-                 std::vector *out_type,
-                 std::vector *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    nnvm::NodeAttrs attrs;
-    attrs.name = "FullyConnected";
-    return ElemwiseAttr(
-        attrs, in_type, out_type, -1);
-  }
-
-  OperatorProperty* Copy() const override {
-    FullyConnectedProp* fc_sym = new FullyConnectedProp();
-    fc_sym->param_ = this->param_;
-    return fc_sym;
-  }
-
-  std::string TypeString() const override {
-    return "FullyConnected";
-  }
-
-  // decalre dependency and inplace optimization options
-  std::vector DeclareBackwardDependency(
-      const std::vector &out_grad,
-      const std::vector &in_data,
-      const std::vector &out_data) const override {
-    return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]};
-  }
-
-  std::vector > BackwardInplaceOption(
-      const std::vector &out_grad,
-      const std::vector &in_data,
-      const std::vector &out_data,
-      const std::vector &in_grad) const override {
-    return {{in_data[fullc::kData], in_grad[fullc::kData]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
+template
+void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector& inputs,
+                               const std::vector& req,
+                               const std::vector& outputs) {
+  const FullyConnectedParam& param = nnvm::get(attrs.parsed);
+  uint32_t out_expected = param.no_bias ?
2 : 3;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), out_expected);
+  CHECK_EQ(req.size(), out_expected);
+
+  std::vector out_grad{inputs[0]};
+  std::vector in_data(inputs.begin() + 1, inputs.end());
+  int dtype = inputs[0].type_flag_;
+
+  switch (dtype) {
+  case mshadow::kFloat32:
+    FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data,
+        req, outputs);
+    break;
+  case mshadow::kFloat64:
+    FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data,
+        req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently "
+                  "only supported by CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
+  }
+}

-  Operator* CreateOperatorEx(Context ctx, std::vector *in_shape,
-                             std::vector *in_type) const override;
-
- private:
-  FullyConnectedParam param_;
-};  // class FullyConnectedSymbol
-#endif
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 9a978160297d..6524fbe349f9 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -29,52 +29,68 @@
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp(FullyConnectedParam param, int dtype,
-                   std::vector *in_shape,
-                   std::vector *out_shape,
-                   Context ctx) {
-  Operator *op = NULL;
-#if MXNET_USE_NNPACK == 1
-  const size_t batch_size = (*in_shape)[0][0];
-  // nnp_fully_connected_inference will do optimization for batch-size = 1
-  // nnp_fully_connected_output will do optimization for batch-size > 1
-  switch (dtype) {
-  case mshadow::kFloat32:
-    return new NNPACKFullyConnectedOp(param);
-  default:
-    break;
+
+static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
+    std::vector *in_shape, std::vector *out_shape) {
+  const FullyConnectedParam& param = nnvm::get(attrs.parsed);
+  using namespace mshadow;
+  if (!param.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  CHECK_EQ(out_shape->size(), 1U);
+  TShape dshape = (*in_shape)[fullc::kData];
+  TShape oshape = (*out_shape)[0];
+  // require data to be known
+  if (dshape.ndim() == 0) return false;
+
+  index_t num_input;
+  if (!param.flatten) {
+    num_input = dshape[dshape.ndim()-1];
+  } else {
+    num_input = dshape.ProdShape(1, dshape.ndim());
  }
-#endif
-  switch (dtype) {
-  case mshadow::kFloat32:
-    op = new FullyConnectedOp(param);
-    break;
-  case mshadow::kFloat64:
-    op = new FullyConnectedOp(param);
-    break;
-  case mshadow::kFloat16:
-    LOG(FATAL) << "float16 fully connected layer is currently"
-                  "only supported by CuDNN version.";
-    break;
-  default:
-    LOG(FATAL) << "Unsupported type " << dtype;
+  SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));
+  if (!param.no_bias) {
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
  }
-  return op;
+  if (!param.flatten) {
+    TShape result_shape(dshape);
+    result_shape[dshape.ndim()-1] = param.num_hidden;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+  } else {
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+  }
+  if (oshape.ndim() != 0) {
+    dshape[0] = oshape[0];
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
+  }
+  return true;
 }
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector *in_shape,
-                                               std::vector *in_type) const {
-  std::vector
out_shape(1, TShape()), aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +static bool FullyConnectedType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + CHECK_GE(in_type->size(), 1U); + return ElemwiseAttr( + attrs, in_type, out_type, -1); } +struct FullyConnectedGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[fullc::kData]); + heads.push_back(n->inputs[fullc::kWeight]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + DMLC_REGISTER_PARAMETER(FullyConnectedParam); -MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) +NNVM_REGISTER_OP(FullyConnected) .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: @@ -96,9 +112,37 @@ The learnable parameters include both ``weight`` and ``bias``. If ``no_bias`` is set to be true, then the ``bias`` term is ignored. )code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + if (!params.no_bias) { + return std::vector{"data", "weight", "bias"}; + } else { + return std::vector{"data", "weight"}; + } +}) +.set_attr("FInferShape", FullyConnectedShape) +.set_attr("FInferType", FullyConnectedType) +.set_attr("FCompute", FullyConnectedCompute) +.set_attr("FGradient", FullyConnectedGrad{"_backward_FullyConnected"}) .add_argument("data", "NDArray-or-Symbol", "Input data.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") .add_arguments(FullyConnectedParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_FullyConnected) +.set_num_outputs(3) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{1, 0}}; +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", FullyConnectedGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu index 279a378e2ad4..81bc1a75aa58 100644 --- a/src/operator/nn/fully_connected.cu +++ b/src/operator/nn/fully_connected.cu @@ -25,16 +25,52 @@ #include "./fully_connected-inl.h" namespace mxnet { namespace op { + template<> -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; +void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 
2 : 3;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), out_expected);
+  CHECK_EQ(req.size(), out_expected);
+
+  std::vector out_grad{inputs[0]};
+  std::vector in_data(inputs.begin() + 1, inputs.end());
+  int dtype = inputs[0].type_flag_;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data,
+        req, outputs);
+  });
+}
+
+NNVM_REGISTER_OP(FullyConnected)
+.set_attr("FCompute", FullyConnectedCompute);
+
+NNVM_REGISTER_OP(_backward_FullyConnected)
+.set_attr("FCompute", FullyConnectedGradCompute);
+
 }  // namespace op
 }  // namespace mxnet

From db7097268c1766f712811528e4f17729c3572437 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Mon, 27 Nov 2017 21:05:10 +0000
Subject: [PATCH 005/264] Move NNVM interface to batch norm.

---
 src/operator/nn/batch_norm-inl.h | 195 ++++++++-----------------------
 src/operator/nn/batch_norm.cc    | 135 +++++++++++++++------
 src/operator/nn/batch_norm.cu    |  78 +++++++++++--
 3 files changed, 211 insertions(+), 197 deletions(-)

diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index 2a9dee2cf845..b229290dd3a8 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm-inl.h
  * \brief
- * \author Bing Xu, Chris Olivier
+ * \author Bing Xu, Chris Olivier, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
 #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
@@ -47,7 +47,8 @@ namespace mxnet {
 namespace op {
 namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta};  // kGamma: weights, kBeta: biases
+enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
+  kInMovingVar};  // kGamma: weights, kBeta: biases
 enum BatchNormOpOutputs {kOut, kMean, kVar};  // req, out_data
 enum BatchNormOpAuxiliary {kMovingMean, kMovingVar};  // aux_states
@@ -87,9 +88,9 @@ struct BatchNormParam : public dmlc::Parameter {
 /*! \brief Batch normalization operator */
 template
-class BatchNormOp : public Operator {
+class BatchNormOp {
  public:
-  explicit BatchNormOp(BatchNormParam param) {
+  void Init(BatchNormParam param) {
     this->param_ = param;
   }
@@ -108,7 +109,7 @@ class BatchNormOp : public Operator {
   * need it; special cases like Batch Norm do.
   * \sa OpReqType, OpContext
   */
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
               const std::vector &in_data,
               const std::vector &req,
               const std::vector &out_data,
@@ -158,7 +159,7 @@ class BatchNormOp : public Operator {
   * \param aux_states Auxiliary states of operator.
Normally operator doesn't need * \sa OperatorProperty, OpReqType, OpContext */ - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -212,151 +213,49 @@ class BatchNormOp : public Operator { BatchNormParam param_; }; // class BatchNormOp -template -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape); - -#if DMLC_USE_CXX11 -class BatchNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; - const TShape &dshape = in_shape->at(0); - - const size_t channelAxis = static_cast(param_.axis < 0 - ? static_cast(dshape.ndim()) + param_.axis - : param_.axis); - CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis; - - const int channelCount = dshape[channelAxis]; - - if (dshape.ndim() == 0) { - return false; - } - - in_shape->at(1) = TShape(Shape1(channelCount)); - in_shape->at(2) = TShape(Shape1(channelCount)); - - out_shape->clear(); - out_shape->push_back(dshape); // kOut - out_shape->push_back(Shape1(channelCount)); // kMean - out_shape->push_back(Shape1(channelCount)); // kVar - - aux_shape->clear(); - aux_shape->push_back(Shape1(channelCount)); // kMovingMean - aux_shape->push_back(Shape1(channelCount)); // kMovingVar - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - using namespace mshadow; - CHECK_GE(in_type->size(), 1U); - const int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - // For float16 input type beta, gamma, mean, and average are stored in float32. - // For other input types, these parameters have the same type as input - // NOTE: This requirement is from cuDNN (v. 
4 and 5) - int dtype_param; - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { - dtype_param = mshadow::DataType::kFlag; }); - for (index_t i = 1; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype_param; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); - } - } - for (index_t i = 0; i < aux_type->size(); ++i) { - if ((*aux_type)[i] != -1) { - UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); - } - } - const size_t n_aux = this->ListAuxiliaryStates().size(); - aux_type->clear(); - for (size_t i = 0; i < n_aux; ++i) { - aux_type->push_back(dtype_param); - } - const size_t n_out = this->ListOutputs().size(); - out_type->clear(); - out_type->push_back(dtype); - for (size_t i = 1; i < n_out; ++i) { - out_type->push_back(dtype_param); - } - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new BatchNormProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "BatchNorm"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[batchnorm::kOut], - out_data[batchnorm::kMean], - out_data[batchnorm::kVar], - in_data[batchnorm::kData], - in_data[batchnorm::kGamma] - }; - } - - int NumVisibleOutputs() const override { - if (param_.output_mean_var) { - return 3; - } - return 1; - } - - int NumOutputs() const override { - return 3; - } - - std::vector ListArguments() const override { - return {"data", "gamma", "beta"}; - } - - std::vector ListOutputs() const override { - return {"output", "mean", "var"}; - } - - std::vector ListAuxiliaryStates() const override { - return {"moving_mean", "moving_var"}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } +template +static BatchNormOp &GetBatchNormOp(const BatchNormParam& param) { + static thread_local BatchNormOp op; + op.Init(param); + return op; +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void BatchNormCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Forward(ctx, in_data, + req, outputs, aux_states); + }); +} - inline const BatchNormParam& getParam() const { - return param_; - } +template +void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam& param = nnvm::get(attrs.parsed); + std::vector out_grad(inputs.begin(), + inputs.begin() + (param.output_mean_var ? 
3U : 1U)); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + + MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Backward(ctx, out_grad, + in_data, out_data, req, in_grad, aux_states); + }); +} - private: - BatchNormParam param_; -}; // class BatchNormProp +#if DMLC_USE_CXX11 namespace batchnorm { diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index ca2883239488..9ce4febd3eef 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -21,16 +21,17 @@ * Copyright (c) 2015 by Contributors * \file batch_norm.cc * \brief - * \author Bing Xu, Chris Olivier + * \author Bing Xu, Chris Olivier, Da Zheng */ #include "batch_norm-inl.h" -#include #if MXNET_USE_MKL2017 == 1 #include #include "../mkl/mkl_memory-inl.h" #include "../mkl/mkl_batch_norm-inl.h" #endif // MXNET_USE_MKL2017 +#include +#include "../elemwise_op_common.h" /*! \brief inverse standard deviation <-> variance */ #define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) @@ -314,45 +315,76 @@ void BatchNormOp::DoBackward(mshadow::Stream *, } } -template<> -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { - param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); - Operator *op = nullptr; -#if MXNET_USE_MKL2017 == 1 - if (shape.ndim() == 4 - && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS - && !mxnet::op::batchnorm::disable_mkl) { - switch (dtype) { - case mshadow::kFloat32: - op = new MKLBatchNormOp(param); - break; - case mshadow::kFloat64: - op = new MKLBatchNormOp(param); - break; - default: - // MKL operator doesn't support half_t, so fall through - break; - } - } -#endif - if (!op) { - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, - DType, - AccReal, { - op = new BatchNormOp(param); }); +DMLC_REGISTER_PARAMETER(BatchNormParam); + +static bool BatchNormShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + using namespace mshadow; + CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]"; + const TShape &dshape = in_shape->at(0); + + const size_t channelAxis = static_cast(param.axis < 0 + ? 
static_cast(dshape.ndim()) + param.axis + : param.axis); + CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis; + + const int channelCount = dshape[channelAxis]; + + if (dshape.ndim() == 0) { + return false; } - return op; + + in_shape->at(1) = TShape(Shape1(channelCount)); + in_shape->at(2) = TShape(Shape1(channelCount)); + in_shape->at(3) = TShape(Shape1(channelCount)); // kMovingMean + in_shape->at(4) = TShape(Shape1(channelCount)); // kMovingVar + + out_shape->clear(); + out_shape->push_back(dshape); // kOut + out_shape->push_back(Shape1(channelCount)); // kMean + out_shape->push_back(Shape1(channelCount)); // kVar + + return true; } -// DO_BIND_DISPATCH comes from operator_common.h -Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); +static inline std::vector ListArguments() { + return {"data", "gamma", "beta"}; } -DMLC_REGISTER_PARAMETER(BatchNormParam); +static inline std::vector ListOutputs() { + return {"output", "mean", "var"}; +} + +static bool BatchNormType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + using namespace mshadow; + CHECK_GE(in_type->size(), 1U); + const int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + // For float16 input type beta, gamma, mean, and average are stored in float32. + // For other input types, these parameters have the same type as input + // NOTE: This requirement is from cuDNN (v. 4 and 5) + int dtype_param; + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { + dtype_param = mshadow::DataType::kFlag; }); + for (index_t i = 1; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype_param; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); + } + } + const size_t n_out = ListOutputs().size(); + out_type->clear(); + out_type->push_back(dtype); + for (size_t i = 1; i < n_out; ++i) { + out_type->push_back(dtype_param); + } + return true; +} -MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp) +NNVM_REGISTER_OP(BatchNorm) .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as @@ -398,14 +430,35 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr then set ``gamma`` to 1 and its gradient to 0. )code" ADD_FILELINE) +.set_num_inputs(5) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mean", "var"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + return param.output_mean_var ? 
3 : 1; +}) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{3, 4}; +}) +.set_attr("FInferShape", BatchNormShape) +.set_attr("FInferType", BatchNormType) +.set_attr("FCompute", BatchNormCompute) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") .add_argument("beta", "NDArray-or-Symbol", "beta array") .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") -.add_arguments(BatchNormParam::__FIELDS__()); - -NNVM_REGISTER_OP(BatchNorm) +.add_arguments(BatchNormParam::__FIELDS__()) .set_attr( "FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { @@ -417,5 +470,11 @@ NNVM_REGISTER_OP(BatchNorm) } }); +NNVM_REGISTER_OP(_backward_BatchNorm) +.set_num_outputs(5) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FCompute", BatchNormGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 59317b7fa837..076ea3034b23 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file batch_norm.cu * \brief CUDA Batch Normalization code - * \author Chris Olivier, Bing Xu + * \author Chris Olivier, Bing Xu, Da Zheng * Adapted from Torch */ #include @@ -637,30 +637,86 @@ void BatchNormOp::DoBackward(mshadow::Stream *stream, MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu); } -/*! \brief Create GPU operator for batch normalization */ +template +static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { + static thread_local CuDNNBatchNormOp op; + op.Init(param); + return op; +} + template<> -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { +void BatchNormCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + BatchNormParam param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + int dtype = inputs[0].type_flag_; + TShape shape = inputs[0].shape_; + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); - Operator *op = NULL; #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNBatchNormOp(param); + GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - op = new BatchNormOp(param); + GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); }) } #else - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, - DType, - AccReal, - { op = new BatchNormOp(param); }); + MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); + }); #endif - return op; } +template<> +void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 11U); + BatchNormParam param = nnvm::get(attrs.parsed); 
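+  // The 11 gradient inputs follow the ElemwiseGradUseInOut layout used at
+  // registration time: three output gradients (out, mean, var), the five
+  // forward inputs (data, gamma, beta, moving_mean, moving_var), then the
+  // three forward outputs; the slices below mirror the CPU implementation.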
+ std::vector out_grad(1, inputs[0]); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + int dtype = inputs[0].type_flag_; + TShape shape = inputs[0].shape_; + + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 + && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, + req, in_grad, aux_states); + }) + } else { + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { + GetBatchNormOp(param).Backward(ctx, out_grad, + in_data, out_data, req, in_grad, aux_states); + }) + } +#else + MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { + GetBatchNormOp(param).Backward(ctx, out_grad, + in_data, out_data, req, in_grad, aux_states); + }); +#endif +} + +NNVM_REGISTER_OP(BatchNorm) +.set_attr("FCompute", BatchNormCompute); + +NNVM_REGISTER_OP(_backward_BatchNorm) +.set_attr("FCompute", BatchNormGradCompute); + } // namespace op } // namespace mxnet From e8ee540d4bd24cd5e93315991af9f377f416830c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:06:23 +0000 Subject: [PATCH 006/264] Use NNVM interface for depthwise convolution. --- src/operator/nn/depthwise_convolution-inl.h | 36 ++++++++------------ src/operator/nn/depthwise_convolution_tf.cuh | 6 ++-- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h index c4b7a4787554..0af8cae51c84 100644 --- a/src/operator/nn/depthwise_convolution-inl.h +++ b/src/operator/nn/depthwise_convolution-inl.h @@ -39,11 +39,11 @@ namespace mxnet { namespace op { using namespace tf::depthwise_conv; template -class DepthwiseConvolutionOp : public Operator { +class DepthwiseConvolutionOp { public: - explicit DepthwiseConvolutionOp(const ConvolutionParam& param, - const std::vector& in_shape, - const std::vector& out_shape) { + void Init(const ConvolutionParam& param, + const std::vector& in_shape, + const std::vector& out_shape) { args_.batch = in_shape[conv::kData][0]; args_.in_channel = in_shape[conv::kData][1]; args_.in_height = in_shape[conv::kData][2]; @@ -62,19 +62,16 @@ class DepthwiseConvolutionOp : public Operator { ~DepthwiseConvolutionOp() {} - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args); + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args); + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &req, + const std::vector &in_grad); private: DepthwiseArgs args_; @@ -282,8 +279,7 @@ template void DepthwiseConvolutionOp::Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + const std::vector &out_data) { using namespace mshadow; using 
namespace mshadow::expr; auto stream = ctx.get_stream(); @@ -305,10 +301,8 @@ template void DepthwiseConvolutionOp::Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; auto stream = ctx.get_stream(); diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh index c7f48e686136..e4dfd8292d2d 100644 --- a/src/operator/nn/depthwise_convolution_tf.cuh +++ b/src/operator/nn/depthwise_convolution_tf.cuh @@ -24,8 +24,8 @@ * are different with origin version. * \author shuqian.qu@hobot.cc */ -#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ -#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ +#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ +#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ #include "../../common/cuda_utils.h" #include "../mxnet_op.h" @@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream } // namespace depthwise_conv } // namespace tf -#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ +#endif // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ From 0e3a16305f3b2fc3e3d04a7fad9f76577ea3c22e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:09:43 +0000 Subject: [PATCH 007/264] Use NNVM interface for softmax activation. --- src/operator/nn/softmax_activation-inl.h | 155 ++++++++--------------- src/operator/nn/softmax_activation.cc | 30 +++-- src/operator/nn/softmax_activation.cu | 52 +++++++- 3 files changed, 116 insertions(+), 121 deletions(-) diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h index 500bf51ccd1f..5d0e937e218d 100644 --- a/src/operator/nn/softmax_activation-inl.h +++ b/src/operator/nn/softmax_activation-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file softmax_activation-inl.h * \brief SoftmaxActivation operator - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ #define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ @@ -66,71 +66,56 @@ struct SoftmaxActivationParam : public dmlc::Parameter { * \tparam xpu The device that the op will be executed on. 
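+ * Under the NNVM interface, Forward and Backward below take a single
+ * TBlob and OpReqType instead of vectors of them.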
*/ template -class SoftmaxActivationOp : public Operator { +class SoftmaxActivationOp { public: - explicit SoftmaxActivationOp(SoftmaxActivationParam p) { + void Init(SoftmaxActivationParam p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); if (param_.mode == softmax_activation::kInstance) { - Tensor data = in_data[softmax_activation::kData].FlatTo2D(s); - Tensor out = out_data[softmax_activation::kOut].FlatTo2D(s); + Tensor data = in_data.FlatTo2D(s); + Tensor out = out_data.FlatTo2D(s); Softmax(out, data); } else { - CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_data.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; - int n = in_data[softmax_activation::kData].size(0); - int k = in_data[softmax_activation::kData].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_activation::kData].Size()/n/k)); - Tensor data = - in_data[softmax_activation::kData].get_with_shape(s3, s); - Tensor out = - out_data[softmax_activation::kOut].get_with_shape(s3, s); + int n = in_data.size(0); + int k = in_data.size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data.Size()/n/k)); + Tensor data = in_data.get_with_shape(s3, s); + Tensor out = out_data.get_with_shape(s3, s); Softmax(out, data); } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1U); // Use 3d tensor for both mode -> {instance, channel}. 
Get shapes - int total_size = in_grad[softmax_activation::kData].Size(); - int batch_size = in_grad[softmax_activation::kData].shape_[0]; - int channel_num = in_grad[softmax_activation::kData].shape_[1]; + int total_size = in_grad.Size(); + int batch_size = in_grad.shape_[0]; + int channel_num = in_grad.shape_[1]; int rest_size = total_size / (batch_size * channel_num); const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); // Get tensors Stream *s = ctx.get_stream(); Tensor m_out_grad = - out_grad[softmax_activation::kOut].get_with_shape(data_shape, s); + out_grad.get_with_shape(data_shape, s); Tensor m_out_data = - out_data[softmax_activation::kOut].get_with_shape(data_shape, s); + out_data.get_with_shape(data_shape, s); Tensor m_in_grad = - in_grad[softmax_activation::kData].get_with_shape(data_shape, s); + in_grad.get_with_shape(data_shape, s); // get requested temp space Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( Shape2(batch_size, rest_size), s); workspace = reduce_with_axis(m_out_grad * m_out_data, 1); - Assign(m_in_grad, req[softmax_activation::kData], + Assign(m_in_grad, req, m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); } @@ -138,76 +123,38 @@ class SoftmaxActivationOp : public Operator { SoftmaxActivationParam param_; }; // class SoftmaxActivationOp -// Decalre Factory function, used for dispatch specialization -template -Operator* CreateOp(SoftmaxActivationParam type); - -#if DMLC_USE_CXX11 -class SoftmaxActivationProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(softmax_activation::kData); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new SoftmaxActivationProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "SoftmaxActivation"; - } - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]}; - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}}; - } +template +void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + static thread_local 
SoftmaxActivationOp<xpu> op;
+  op.Init(param);
+  op.Forward(ctx, inputs[0], req[0], outputs[0]);
+}

-  Operator* CreateOperator(Context ctx) const override;
+template<typename xpu>
+void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs,
+                                  const OpContext& ctx,
+                                  const std::vector<TBlob>& inputs,
+                                  const std::vector<OpReqType>& req,
+                                  const std::vector<TBlob>& outputs) {
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(req.size(), 1);
+
+  static thread_local SoftmaxActivationOp<xpu> op;
+  op.Init(param);
+  op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
+}

- private:
-  SoftmaxActivationParam param_;
-};
-#endif  // DMLC_USE_CXX11

 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
diff --git a/src/operator/nn/softmax_activation.cc b/src/operator/nn/softmax_activation.cc
index 657b382c6e03..bdfd8b065de1 100644
--- a/src/operator/nn/softmax_activation.cc
+++ b/src/operator/nn/softmax_activation.cc
@@ -21,26 +21,18 @@
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief softmax_activation op
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #include "./softmax_activation-inl.h"
+#include "../tensor/elemwise_unary_op.h"
 #include "../mshadow_op.h"

 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(SoftmaxActivationParam param) {
-  return new SoftmaxActivationOp<cpu>(param);
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *SoftmaxActivationProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
-}

 DMLC_REGISTER_PARAMETER(SoftmaxActivationParam);

-MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp)
+MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation)
 .describe(R"code(Applies softmax activation to input. This is intended for internal layers.

.. 
note:: @@ -65,8 +57,22 @@ Example:: [ 6.56221947e-03 5.95310994e-04 9.73919690e-01 1.78379621e-02 1.08472735e-03]] )code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxActivationCompute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"}) .add_arguments(SoftmaxActivationParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_SoftmaxActivation) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu index 0810483e1262..9aba20ece514 100644 --- a/src/operator/nn/softmax_activation.cu +++ b/src/operator/nn/softmax_activation.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file softmax_activation.cu * \brief - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./softmax_activation-inl.h" #include "../mshadow_op.h" @@ -31,14 +31,56 @@ namespace mxnet { namespace op { + template<> -Operator *CreateOp(SoftmaxActivationParam param) { +void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + #if MXNET_USE_CUDNN == 1 - return new CuDNNSoftmaxActivationOp(param); + static thread_local CuDNNSoftmaxActivationOp op; + op.Init(param); + op.Forward(ctx, inputs[0], req[0], outputs[0]); #else - return new SoftmaxActivationOp(param); -#endif // MXNET_USE_CUDNN + static thread_local SoftmaxActivationOp op; + op.Init(param); + op.Forward(ctx, inputs[0], req[0], outputs[0]); +#endif } + +template<> +void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + +#if MXNET_USE_CUDNN == 1 + static thread_local CuDNNSoftmaxActivationOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); +#else + static thread_local SoftmaxActivationOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); +#endif +} + +NNVM_REGISTER_OP(SoftmaxActivation) +.set_attr("FCompute", SoftmaxActivationCompute); + +NNVM_REGISTER_OP(_backward_SoftmaxActivation) +.set_attr("FCompute", SoftmaxActivationGradCompute); + } // namespace op } // namespace mxnet From f32be0986d773c856f644a2d5b7a9fd3ba7a5a0c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:11:34 +0000 Subject: [PATCH 008/264] Use NNVM interface for pooling. 
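Like the other operators in this series, Pooling now dispatches through
stateless FCompute functions; the stateful PoolingOp instance is fetched
from a per-thread cache (GetPoolingOp) and re-initialized from the parsed
parameters on every call. A minimal, self-contained sketch of that caching
pattern follows; CachedOp, Param, and GetOp are hypothetical stand-ins for
PoolingOp, PoolingParam, and GetPoolingOp, not code from this patch:

    #include <iostream>

    struct Param { int scale; };  // stands in for PoolingParam

    class CachedOp {              // stands in for PoolingOp<xpu, DType>
     public:
      void Init(const Param& p) { param_ = p; }  // cheap: copies params only
      void Forward(int x) const { std::cout << x * param_.scale << "\n"; }
     private:
      Param param_;
    };

    // One instance per thread (and, in the real code, per template
    // specialization), so nothing is allocated on the hot path.
    CachedOp& GetOp(const Param& p) {
      static thread_local CachedOp op;
      op.Init(p);  // re-init each call; the op keeps no cross-call state
      return op;
    }

    int main() {
      GetOp(Param{2}).Forward(21);  // prints 42
      GetOp(Param{3}).Forward(14);  // same thread reuses the same instance
    }

A consequence of this pattern is that Init() should stay cheap and the op
should not carry state that depends on a particular input across calls.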
--- src/operator/nn/pooling-inl.h | 273 ++++++++-------------------------- src/operator/nn/pooling.cc | 188 +++++++++++++++++------ src/operator/nn/pooling.cu | 81 ++++++++-- 3 files changed, 270 insertions(+), 272 deletions(-) diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index a32aaa2152e9..015b83c4fbc6 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file pooling-inl.h * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_ @@ -81,254 +81,97 @@ struct PoolingParam : public dmlc::Parameter { }; template -class PoolingOp : public Operator { +class PoolingOp { public: - explicit PoolingOp(PoolingParam p) { + void Init(PoolingParam p) { this->param_ = p; } - virtual void Forward(const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data, - const std::vector& aux_args) { + void Forward(const OpContext& ctx, const TBlob& in_data, + const OpReqType& req, const TBlob& out_data) { using namespace mshadow; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[pool_enum::kData].shape_; + const TShape& ishape = in_data.shape_; - pool(s, in_data[pool_enum::kData].dptr(), - in_data[pool_enum::kData].shape_, - out_data[pool_enum::kOut].shape_, + pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, param_.global_pool? TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) : param_.kernel, param_.pad, param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, - req[pool_enum::kOut], - out_data[pool_enum::kOut].dptr()); + param_.pool_type, req, out_data.dptr()); } - virtual void Backward(const OpContext& ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& out_data, - const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { + void Backward(const OpContext& ctx, const TBlob& out_grad, + const TBlob& in_data, const TBlob& out_data, + const OpReqType& req, const TBlob& in_grad) { using namespace mshadow; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[pool_enum::kData].shape_; + const TShape& ishape = in_data.shape_; - unpool(s, out_grad[pool_enum::kOut].dptr(), - in_data[pool_enum::kData].dptr(), - out_data[pool_enum::kOut].dptr(), - in_grad[pool_enum::kData].shape_, - out_grad[pool_enum::kOut].shape_, + unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), + in_grad.shape_, out_grad.shape_, param_.global_pool? TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) : param_.kernel, param_.pad, param_.global_pool? 
TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, - req[pool_enum::kData], - in_grad[pool_enum::kData].dptr()); + param_.pool_type, req, in_grad.dptr()); } private: PoolingParam param_; }; // class PoolingOp -template -Operator* CreateOp(PoolingParam param, int dtype); - +template +PoolingOp &GetPoolingOp(const PoolingParam ¶m) { + static thread_local PoolingOp op; + op.Init(param); + return op; +} -#if DMLC_USE_CXX11 -class PoolingProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); +template +void PoolingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + GetPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - } - CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) - << "stride and kernel should have the same length"; - CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) - << "pad and kernel should have the same length"; - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" - << " Or 4D in (batch, channel, y, x) " - << " Or 5D in (batch, channel, d, y, x)"; - TShape oshape = dshape; - if (dshape.ndim() == 0) return false; - if (param_.kernel.ndim() == 1) { - CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; - if (param_.global_pool) { - oshape[2] = 1; - } else { - CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - } - } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape - } else if (param_.kernel.ndim() == 2) { - CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - if (param_.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - } else { - CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 
2*param_.pad[0]) << ")"; - CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) - << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] - << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); - } - } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape - } else if (param_.kernel.ndim() == 3) { - CHECK_EQ(dshape.ndim(), 5U) - << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; - CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; - CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; - CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; - if (param_.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - oshape[4] = 1; - } else { - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; - oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / - param_.stride[2]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); - oshape[4] = 1 + static_cast(ceil(static_cast( - dshape[4] + 2 * param_.pad[2] - - param_.kernel[2]) / param_.stride[2])); - } - } - - out_shape->clear(); - out_shape->push_back(oshape); // save output shape + LOG(FATAL) << "unknown pooling type"; } - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_EQ(in_type->size(), 1U); - int dtype = (*in_type)[0]; + }); +} - if (dtype == -1) { - LOG(FATAL) << "Input type to pooling is not specified."; - return false; +template +void PoolingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + GetPoolingOp(param).Backward(ctx, + inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; } + }); +} - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - PoolingProp *prop_sym = new PoolingProp(); - prop_sym->param_ = this->param_; - return prop_sym; - } - - std::string TypeString() const override { - return "Pooling"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const 
override { - return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], - out_data[pool_enum::kOut]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { -#if MXNET_USE_CUDNN == 1 - return {}; -#else - return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}}; -#endif - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - PoolingParam param_; -}; // class PoolingProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 8345ea3886d4..3c30e1924323 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -21,9 +21,10 @@ * Copyright (c) 2017 by Contributors * \file pooling.cc * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #include "./pooling-inl.h" +#include "../elemwise_op_common.h" #if MXNET_USE_MKL2017 == 1 #include #include "../mkl/mkl_memory-inl.h" @@ -36,62 +37,137 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - if (param.kernel.ndim() == 2 - && ((param.pool_type == pool_enum::kMaxPooling) - || (param.pool_type == pool_enum::kAvgPooling))) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLPoolingOp(param); - case mshadow::kFloat64: - return new MKLPoolingOp(param); - default: - break; +static void PoolingParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + PoolingParam param_; + param_.Init(attrs->dict); + if (param_.kernel.ndim() == 1) { + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) + << "pad and kernel should have the same length"; + attrs->parsed = std::move(param_); +} + +static bool PoolingShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const PoolingParam& param_ = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param_.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param_.global_pool) { + oshape[2] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + 
(dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); } } -#endif -#if MXNET_USE_NNPACK == 1 - // NNPACK only support max-pooling with kernel = 2, stride = 2, pooling_convention - // = kFull(note that the default value is kValid in MXNet) - if ((param.pool_type == pool_enum::kMaxPooling) - && (param.pooling_convention == pool_enum::kFull) - && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2) - && (param.kernel[0] == 2) && (param.kernel[1] == 2) - && (param.stride[0] == 2) && (param.stride[1] == 2)) { - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKPoolingOp(param); - default: - break; + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) + << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] + << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + } } - } -#endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (pool_enum::kMaxPooling == param.pool_type - || pool_enum::kAvgPooling == param.pool_type - || pool_enum::kSumPooling == param.pool_type) { - op = new PoolingOp(param); + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 3) { + CHECK_EQ(dshape.ndim(), 5U) + << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + oshape[4] = 1; } else { - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / + param_.stride[2]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + oshape[4] = 1 + static_cast(ceil(static_cast( + dshape[4] + 2 * param_.pad[2] - + param_.kernel[2]) / param_.stride[2])); + } } - }); - return op; + 
out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } + return true; } -// DO_BIND_DISPATCH comes from operator_common.h -Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); -} +struct PoolingGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(ograds[pool_enum::kOut]); + heads.push_back(n->inputs[pool_enum::kData]); + heads.emplace_back(nnvm::NodeEntry{n, pool_enum::kOut, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; DMLC_REGISTER_PARAMETER(PoolingParam); -MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) +NNVM_REGISTER_OP(Pooling) .describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are @@ -131,8 +207,28 @@ For 3-D pooling, an additional *depth* dimension is added before height, width)*. )code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(PoolingParamParser) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferShape", PoolingShape) +.set_attr("FCompute", PoolingCompute) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_Pooling"}) .add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.") .add_arguments(PoolingParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Pooling) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ +#if MXNET_USE_CUDNN == 1 + return std::vector >(); +#else + return std::vector >{{1, 0}}; +#endif +}) +.set_attr_parser(PoolingParamParser) +.set_attr("FCompute", PoolingGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index dcebe6798263..4d5c68f7ca6b 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file pooling.cu * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #include #include "./pooling-inl.h" @@ -32,38 +32,97 @@ namespace mxnet { namespace op { +#if MXNET_USE_CUDNN == 1 +template +static CuDNNPoolingOp &GetCuDNNPoolingOp(const PoolingParam ¶m) { + static thread_local CuDNNPoolingOp op; + op.Init(param); + return op; +} +#endif + template<> -Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; +void PoolingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + #if MXNET_USE_CUDNN == 1 if (!param.cudnn_off && param.kernel.ndim() > 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { switch (param.pool_type) { case pool_enum::kMaxPooling: - op = new CuDNNPoolingOp(param); - break; case pool_enum::kAvgPooling: - op = new CuDNNPoolingOp(param); + GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + return; + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; break; + } + }); + } +#endif // MXNET_USE_CUDNN + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + 
GetPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; + } + }); +} + +template<> +void PoolingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const PoolingParam& param = nnvm::get(attrs.parsed); + +#if MXNET_USE_CUDNN == 1 + if (!param.cudnn_off && param.kernel.ndim() > 1) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + case pool_enum::kAvgPooling: + GetCuDNNPoolingOp(param).Backward(ctx, + inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + return; case pool_enum::kSumPooling: LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; break; } }); } - if (op) return op; #endif // MXNET_USE_CUDNN - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - op = new PoolingOp(param); + GetPoolingOp(param).Backward(ctx, + inputs[0], inputs[1], inputs[2], req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; } }); - return op; } +NNVM_REGISTER_OP(Pooling) +.set_attr("FCompute", PoolingCompute); + +NNVM_REGISTER_OP(_backward_Pooling) +.set_attr("FCompute", PoolingGradCompute); + } // namespace op } // namespace mxnet From 2e7284d70d97e6c79ad8338f1c21ab7ff2a3e1f7 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:25:26 +0000 Subject: [PATCH 009/264] use NNVM interface for dropout. --- src/operator/nn/dropout-inl.h | 166 +++++++++------------------------- src/operator/nn/dropout.cc | 87 +++++++++++++++--- src/operator/nn/dropout.cu | 17 ++-- 3 files changed, 122 insertions(+), 148 deletions(-) diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 4c8a5eee68ce..4a9228f9a14e 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file dropout-inl.h * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_ @@ -93,18 +93,15 @@ struct DropoutParam : public dmlc::Parameter { }; // struct DropoutParam template -class DropoutOp : public Operator { +class DropoutOp { public: - explicit DropoutOp(DropoutParam param) { + void Init(const DropoutParam ¶m) { this->pkeep_ = 1.0f - param.p; this->mode_ = param.mode; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1U); @@ -138,21 +135,14 @@ class DropoutOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data_mask, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); 
- CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); - Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + Tensor grad = out_grad.FlatTo2D(s); + Tensor mask = out_data_mask.FlatTo2D(s); + Tensor gdata = in_grad.FlatTo2D(s); if (ctx.is_train || mode_ == dropout::kAlways) { #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) DType* ingradptr = gdata.dptr_; @@ -165,11 +155,10 @@ class DropoutOp : public Operator { ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1; } #else // USE_MKL && _OPENMP - CHECK_EQ(grad.shape_.Size(), mask.shape_.Size()); - Assign(gdata, req[dropout::kData], grad * mask); + Assign(gdata, req, grad * mask); #endif // USE_MKL && _OPENMP } else { - Assign(gdata, req[dropout::kData], F(grad)); + Assign(gdata, req, F(grad)); } } @@ -178,111 +167,38 @@ class DropoutOp : public Operator { int mode_; }; // class DropoutOp - template -Operator *CreateOp(DropoutParam param, int dtype); - -#if DMLC_USE_CXX11 -class DropoutProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_EQ(in_type->size(), 1U); - int dtype = in_type->at(0); - - if (dtype == -1) { - LOG(FATAL) << "input type to dropout is not specified."; - return false; - } - - size_t nout = this->ListOutputs().size(); - out_type->clear(); - for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new DropoutProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Dropout"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[dropout::kOut], out_data[dropout::kMask]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[dropout::kOut], in_grad[dropout::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[dropout::kData], out_data[dropout::kOut]}}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kRandom}; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 2; - } - - std::vector ListOutputs() const override { - return {"output", "mask"}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } +void DropoutCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DropoutParam& param = nnvm::get(attrs.parsed); + 
MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + static thread_local DropoutOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void DropoutGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DropoutParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + static thread_local DropoutOp op; + op.Init(param); + op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); + }); +} - private: - DropoutParam param_; -}; // class DropoutProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_DROPOUT_INL_H_ diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index bbf5e2dea25b..b28113ce9322 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -21,31 +21,32 @@ * Copyright (c) 2015 by Contributors * \file dropout.cc * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./dropout-inl.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(DropoutParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DropoutOp(param); - }); - return op; -} -// DO_BIND_DISPATCH comes from operator_common.h -Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +struct DropoutGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(ograds[0]); + heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +std::vector ListOutputs() { + return std::vector{"output", "mask"}; } DMLC_REGISTER_PARAMETER(DropoutParam); -MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp) +NNVM_REGISTER_OP(Dropout) .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. @@ -76,8 +77,66 @@ Example:: [[ 3. 0.5 -0.5 2. 7. ] [ 2. -0.4 7. 3. 
0.2 ]] )" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return ListOutputs(); +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + return 1; +}) +.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape){ + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; +}) +.set_attr("FInferType", [](const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + CHECK_EQ(in_type->size(), 1U); + int dtype = in_type->at(0); + + if (dtype == -1) { + LOG(FATAL) << "input type to dropout is not specified."; + return false; + } + + size_t nout = ListOutputs().size(); + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + return true; +}) +.set_attr("FCompute", DropoutCompute) +.set_attr("FGradient", DropoutGrad{"_backward_Dropout"}) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kRandom}; +}) .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.") .add_arguments(DropoutParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Dropout) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FCompute", DropoutGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index f416c5883203..e655278822a4 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -21,21 +21,20 @@ * Copyright (c) 2015 by Contributors * \file dropout.cc * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./dropout-inl.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(DropoutParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DropoutOp(param); - }); - return op; -} + +NNVM_REGISTER_OP(Dropout) +.set_attr("FCompute", DropoutCompute); + +NNVM_REGISTER_OP(_backward_Dropout) +.set_attr("FCompute", DropoutGradCompute); + } // namespace op } // namespace mxnet From a80724dc10d250d29ed002411230721cac066a54 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:51:41 +0000 Subject: [PATCH 010/264] Use NNVM interface for activation. 
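With the NNVM interface the backward node no longer receives the forward
input for the plain (non-cuDNN) kernels: each supported activation has a
derivative expressible through its own output y, so out_grad and out_data
suffice. The identities used by the mshadow_op *_grad functors are
relu: y > 0, sigmoid: y*(1-y), tanh: 1-y*y, softrelu: 1-exp(-y). A small
standalone check of those identities against numeric differentiation (not
part of this patch, just a sanity sketch):

    #include <cmath>
    #include <cstdio>

    static double sigmoid(double x)  { return 1.0 / (1.0 + std::exp(-x)); }
    static double softrelu(double x) { return std::log1p(std::exp(x)); }

    int main() {
      const double x = 0.7, h = 1e-6;
      const double ys = sigmoid(x), yt = std::tanh(x), yr = softrelu(x);
      struct { const char* name; double from_y; double numeric; } checks[] = {
        {"sigmoid",  ys * (1.0 - ys),
                     (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)},
        {"tanh",     1.0 - yt * yt,
                     (std::tanh(x + h) - std::tanh(x - h)) / (2 * h)},
        {"softrelu", 1.0 - std::exp(-yr),
                     (softrelu(x + h) - softrelu(x - h)) / (2 * h)},
      };
      for (const auto& c : checks)  // the two columns agree to ~1e-9
        std::printf("%-8s grad-from-output=%.9f numeric=%.9f\n",
                    c.name, c.from_y, c.numeric);
    }

cuDNN is the exception: cudnnActivationBackward also needs the forward
input, which is why the ActivationGrad functor in this patch appends
in_data to the backward inputs only under MXNET_USE_CUDNN.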
--- src/operator/nn/activation-inl.h | 241 +++++++++++++------------------ src/operator/nn/activation.cc | 78 ++++------ src/operator/nn/activation.cu | 86 +++++++---- 3 files changed, 189 insertions(+), 216 deletions(-) diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index ac8b747f0f39..c46c33d6dfda 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file activation-inl.h * \brief Activation operator - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_ @@ -37,6 +37,7 @@ #include #include "../operator_common.h" #include "../mxnet_op.h" +#include "../mshadow_op.h" namespace mxnet { namespace op { @@ -61,158 +62,114 @@ struct ActivationParam : public dmlc::Parameter { } }; -/** - * \brief This is the implementation of activation operator. - * \tparam xpu The device that the op will be executed on. - */ template -class ActivationOp : public Operator { - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - const TBlob& input = in_data[activation::kData]; - const size_t sz = input.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, sz, - out_data[activation::kOut].dptr(), - input.dptr()); - }); - } +void ActivationForward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + const size_t sz = input.shape_.Size(); + if (sz) { + MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, { + mxnet_op::Kernel, xpu>::Launch( + s, sz, + out_data.dptr(), + in_data.dptr()); + }); } +} - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1U); - Stream *s = ctx.get_stream(); - const TBlob& m_out_grad = out_grad[activation::kOut]; - const TBlob& m_out_data = out_data[activation::kOut]; - const TBlob& m_in_grad = in_grad[activation::kData]; - const size_t sz = m_out_data.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, { - mxnet_op::Kernel, Req>, xpu>::Launch( - s, sz, - m_in_grad.dptr(), - m_out_grad.dptr(), - m_out_data.dptr()); - }); - } +template +void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, + const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + const size_t sz = out_data.shape_.Size(); + if (sz) { + MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, { + mxnet_op::Kernel, Req>, xpu>::Launch( + s, sz, + in_grad.dptr(), + out_grad.dptr(), + out_data.dptr()); + }); } -}; // class ActivationOp +} -// Declare Factory function, used for dispatch specialization template -Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape); - -#if DMLC_USE_CXX11 -class 
ActivationProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(activation::kData); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } +void ActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + switch (param.act_type) { + case activation::kReLU: + ActivationForward( + ctx, inputs[0], req[0], outputs[0]); + break; + case activation::kSigmoid: + ActivationForward( + ctx, inputs[0], req[0], outputs[0]); + break; + case activation::kTanh: + ActivationForward( + ctx, inputs[0], req[0], outputs[0]); + break; + case activation::kSoftReLU: + ActivationForward( + ctx, inputs[0], req[0], outputs[0]); + break; + default: + LOG(FATAL) << "unknown activation type"; } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ActivationProp(); - ptr->param_ = param_; - return ptr; - } + }); +} - std::string TypeString() const override { - return "Activation"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { +template +void ActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { #if MXNET_USE_CUDNN == 1 - return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]}; + CHECK_EQ(inputs.size(), 3U); #else - return {out_grad[activation::kOut], out_data[activation::kOut]}; -#endif // MXNET_USE_CUDNN - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[activation::kOut], in_grad[activation::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[activation::kData], out_data[activation::kOut]}}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; + CHECK_EQ(inputs.size(), 2U); +#endif + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + 
MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + switch (param.act_type) { + case activation::kReLU: + ActivationBackward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + break; + case activation::kSigmoid: + ActivationBackward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + break; + case activation::kTanh: + ActivationBackward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + break; + case activation::kSoftReLU: + ActivationBackward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + break; + default: + LOG(FATAL) << "unknown activation type"; + } + }); +} - private: - ActivationParam param_; -}; -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_ACTIVATION_INL_H_ diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 401a9e3eaa56..c437b685ddc6 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -21,10 +21,11 @@ * Copyright (c) 2015 by Contributors * \file activation.cc * \brief activation op - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./activation-inl.h" #include "../mshadow_op.h" +#include "../tensor/elemwise_unary_op.h" #if MXNET_USE_MKL2017 == 1 #include #include "../mkl/mkl_memory-inl.h" @@ -33,53 +34,24 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - if (param.act_type == activation::kReLU && dshape.ndim() <= 4) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLReluOp(); - case mshadow::kFloat64: - return new MKLReluOp(); - default: - break; - } - } - if (enableMKLWarnGenerated()) - LOG(INFO) << MKLReluOp::getName() << " Skip MKL optimization"; -#endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.act_type) { - case activation::kReLU: - op = new ActivationOp(); - break; - case activation::kSigmoid: - op = new ActivationOp(); - break; - case activation::kTanh: - op = new ActivationOp(); - break; - case activation::kSoftReLU: - op = new ActivationOp(); - break; - default: - LOG(FATAL) << "unknown activation type"; - } - }) - return op; -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); -} DMLC_REGISTER_PARAMETER(ActivationParam); -MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp) +// This will determine the order of the inputs for backward computation. +struct ActivationGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0}); +#if MXNET_USE_CUDNN == 1 + heads.push_back(n->inputs[activation::kData]); +#endif + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +MXNET_OPERATOR_REGISTER_UNARY(Activation) .describe(R"code(Applies an activation function element-wise to the input. 
The following activation functions are supported: @@ -90,8 +62,22 @@ The following activation functions are supported: - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` )code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") +.set_attr_parser(ParamParser) +.set_attr("FCompute", ActivationCompute) +.set_attr("FGradient", ActivationGrad{"_backward_Activation"}) .add_arguments(ActivationParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Activation) +.set_num_inputs(3) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInferShape", ElemwiseShape<3, 1>) +.set_attr("FInferType", ElemwiseType<3, 1>) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", ActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu index c2f6be9f37c8..0dea6c3bb5a4 100644 --- a/src/operator/nn/activation.cu +++ b/src/operator/nn/activation.cu @@ -31,39 +31,69 @@ namespace mxnet { namespace op { + +#if MXNET_USE_CUDNN == 1 + +template +static CuDNNActivationOp &get_cudnn_op(const ActivationParam& param) { + static thread_local CuDNNActivationOp cudnn_op; + cudnn_op.Init(param); + return cudnn_op; +} + template<> -Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { - Operator *op = NULL; +void ActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + // SoftReLU not supported by CUDNN yet if (param.act_type == activation::kSoftReLU) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ActivationOp(); - }) - return op; + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_activation_op().Forward(ctx, + inputs[0], req[0], outputs[0]); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_cudnn_op(param).Forward(ctx, inputs[0], req[0], outputs[0]); + }); } +} -#if MXNET_USE_CUDNN == 1 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNActivationOp(param); - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.act_type) { - case activation::kReLU: - op = new ActivationOp(); - break; - case activation::kSigmoid: - op = new ActivationOp(); - break; - case activation::kTanh: - op = new ActivationOp(); - break; - default: - LOG(FATAL) << "unknown activation"; - } - }) -#endif // MXNET_USE_CUDNN - return op; +template<> +void ActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + + // SoftReLU not supported by CUDNN yet + if (param.act_type == activation::kSoftReLU) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_activation_op().Backward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_cudnn_op(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]); + }); + } } +#endif + +NNVM_REGISTER_OP(Activation) +.set_attr("FCompute", ActivationCompute); + +NNVM_REGISTER_OP(_backward_Activation) 
+.set_attr("FCompute", ActivationGradCompute); + } // namespace op } // namespace mxnet From 072c734082ef881d6a4593ea6d1d7717298b18ef Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:57:31 +0000 Subject: [PATCH 011/264] Use NNVM interface for CuDNN batch norm. --- src/operator/nn/cudnn/cudnn_batch_norm-inl.h | 158 +++++-------------- src/operator/nn/cudnn/cudnn_batch_norm.cc | 104 +++++++++--- src/operator/nn/cudnn/cudnn_batch_norm.cu | 54 ++++++- 3 files changed, 166 insertions(+), 150 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h index 3dc9c8353a35..e2337049060e 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h +++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h @@ -43,28 +43,30 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar}; #if defined(__CUDACC__) template -class CuDNNBatchNormOp : public Operator { +class CuDNNBatchNormOp { public: - explicit CuDNNBatchNormOp(BatchNormParam param) { + CuDNNBatchNormOp() { using namespace mshadow; - CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) - << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; - this->param_ = param; - init_cudnn_ = false; dtype_ = DataType::kCudnnFlag; // For float16 input type beta, gamma, mean, and average are stored in float32. // For other input types, these parameters have the same type as input dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? kFloat32 : DataType::kFlag; + CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); + } + + void Init(const BatchNormParam ¶m) { + CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) + << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; + this->param_ = param; } ~CuDNNBatchNormOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -84,29 +86,7 @@ class CuDNNBatchNormOp : public Operator { CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2); CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4); - if (!init_cudnn_) { - for (int i = 0; i < 4; ++i) { - if (i < in_data[cudnnbatchnorm::kData].ndim()) { - shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i]; - } else { - shape_[i] = 1; - } - } - CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - shape_[0], - shape_[1], - shape_[2], - shape_[3])); - CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, - io_desc_, - CUDNN_BATCHNORM_SPATIAL)); - init_cudnn_ = true; - } - + Init(in_data[cudnnbatchnorm::kData]); Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -177,7 +157,7 @@ class CuDNNBatchNormOp : public Operator { }) } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -193,6 +173,7 @@ class CuDNNBatchNormOp : public Operator { CHECK(ctx.is_train && !param_.use_global_stats) << "use global statistics is not yet supported in CuDNNBatchNorm"; + Init(in_data[cudnnbatchnorm::kData]); Stream *s = 
ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -290,7 +271,27 @@ class CuDNNBatchNormOp : public Operator { } private: - bool init_cudnn_; + void Init(const TBlob &in_data) { + for (int i = 0; i < 4; ++i) { + if (i < in_data.ndim()) { + shape_[i] = in_data.shape_[i]; + } else { + shape_[i] = 1; + } + } + + CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + shape_[0], + shape_[1], + shape_[2], + shape_[3])); + CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, + io_desc_, + CUDNN_BATCHNORM_SPATIAL)); + } + cudnnDataType_t dtype_; int dtype_param_; cudnnTensorDescriptor_t io_desc_, mean_desc_; @@ -299,91 +300,6 @@ class CuDNNBatchNormOp : public Operator { }; #endif // defined(__CUDACC__) -template -Operator *CreateOp_CuDNNv4(BatchNormParam param); - - -#if DMLC_USE_CXX11 -class CuDNNBatchNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - in_shape->at(1) = TShape(Shape1(dshape[1])); - in_shape->at(2) = TShape(Shape1(dshape[1])); - - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(Shape1(dshape[1])); - out_shape->push_back(Shape1(dshape[1])); - - aux_shape->clear(); - aux_shape->push_back(Shape1(dshape[1])); - aux_shape->push_back(Shape1(dshape[1])); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new CuDNNBatchNormProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "CuDNNBatchNorm"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[cudnnbatchnorm::kOut], - out_data[cudnnbatchnorm::kMean], - out_data[cudnnbatchnorm::kInvVar], - in_data[cudnnbatchnorm::kData], - in_data[cudnnbatchnorm::kGamma] - }; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 3; - } - - std::vector ListArguments() const override { - return {"data", "gamma", "beta"}; - } - - std::vector ListOutputs() const override { - return {"output", "mean", "inv_var"}; - } - - std::vector ListAuxiliaryStates() const override { - return {"moving_mean", "moving_inv_var"}; - } - - Operator* CreateOperator(Context ctx) const override; - - private: - BatchNormParam param_; -}; // class CuDNNBatchNormProp - -#endif // DMLC_USE_CXX11 #endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc index e1e0c999b1fb..f1d229dd5421 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cc +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc @@ -21,46 +21,100 @@ * Copyright (c) 2015 by Contributors * \file cudnn_batch_norm.cc * \brief - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./cudnn_batch_norm-inl.h" #include +#include "../../elemwise_op_common.h" namespace mxnet { namespace op { -#if CUDNN_MAJOR >= 4 -template<> -Operator *CreateOp_CuDNNv4(BatchNormParam param) { +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 
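+// Shape inference for the registration below: all five inputs are visible to
+// the graph, so gamma, beta and the moving statistics are each constrained to
+// a 1-D vector whose length is the channel dimension of the data, and the
+// three outputs are (output, mean, var).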
+ +static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + in_shape->at(1) = TShape(Shape1(dshape[1])); + in_shape->at(2) = TShape(Shape1(dshape[1])); + in_shape->at(3) = TShape(Shape1(dshape[1])); + in_shape->at(4) = TShape(Shape1(dshape[1])); + + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(Shape1(dshape[1])); + out_shape->push_back(Shape1(dshape[1])); + + return true; +} + +static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; - return NULL; } -Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const { -#if CUDNN_MAJOR >= 5 - LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." - "Use the later instead."; - return nullptr; -#else - DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_); -#endif +static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; } -MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp) +NNVM_REGISTER_OP(CuDNNBatchNorm) .describe("Apply batch normalization to input.") +.set_num_inputs(5) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mean", "var"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + return 1; +}) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{3, 4}; +}) +.set_attr("FInferShape", BatchNormShape) +.set_attr("FCompute", BatchNormCompute_CPU) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"}) .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") -.add_arguments(BatchNormParam::__FIELDS__()); +.add_argument("gamma", "NDArray-or-Symbol", "gamma array") +.add_argument("beta", "NDArray-or-Symbol", "beta array") +.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") +.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") +.add_arguments(BatchNormParam::__FIELDS__()) +.set_attr( + "FSetInputVarAttrOnCompose", + [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { + if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; + if (index == 3) { + var->attrs.dict["__init__"] = "[\"zero\", {}]"; + } else if (index == 4) { + var->attrs.dict["__init__"] = "[\"one\", {}]"; + } + }); + +NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) +.set_num_outputs(5) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{6, 7}; +}) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FCompute", BatchNormGradCompute_CPU); -NNVM_REGISTER_OP(CuDNNBatchNorm) -.set_attr("FSetInputVarAttrOnCompose", - [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { - if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; - if (index == 3) { - 
var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } else if (index == 4) { - var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } - }); #endif // CUDNN_MAJOR >= 4 + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu index e96db2e5e73f..c929ab2e2878 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cu +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file cudnn_batch_norm.cu * \brief - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./cudnn_batch_norm-inl.h" @@ -30,10 +30,56 @@ namespace mxnet { namespace op { #if CUDNN_MAJOR == 4 -template<> -Operator *CreateOp_CuDNNv4(BatchNormParam param) { - return new CuDNNBatchNormOp(param); + +template +static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { + static thread_local CuDNNBatchNormOp op; + op.Init(param); + return op; +} + +static void BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." + " Use the latter instead."; +#else + const BatchNormParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); +#endif +} + +static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." + " Use the latter instead."; +#else + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam& param = nnvm::get(attrs.parsed); + std::vector out_grad(1, inputs[0]); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, + req, in_grad, aux_states); +#endif } + +NNVM_REGISTER_OP(CuDNNBatchNorm) +.set_attr("FCompute", BatchNormCompute_CuDNNv4); + +NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) +.set_attr("FCompute", BatchNormGradCompute_CuDNNv4); + #endif // CUDNN_MAJOR == 4 } // namespace op } // namespace mxnet From f3ca6ff1ef0f93fb103ba5771b2b024d14068536 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 21:59:15 +0000 Subject: [PATCH 012/264] Use NNVM interface for CuDNN pooling.
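Descriptors are now created unconditionally in the constructor, reconfigured from the current tensor shapes by the private Init() at the top of every Forward/Backward, and destroyed unconditionally in the destructor, which is what makes the init_cudnn_ flag unnecessary. A minimal sketch of that lifecycle, assuming a 4-D float input (illustrative only, not code from this patch):

    class DescriptorHolder {
     public:
      DescriptorHolder()  { CUDNN_CALL(cudnnCreateTensorDescriptor(&desc_)); }
      ~DescriptorHolder() { CUDNN_CALL(cudnnDestroyTensorDescriptor(desc_)); }

      void Forward(const TBlob &in) {
        SetShape(in);  // cheap, safe to repeat on every call
        // ... launch the cuDNN kernel against desc_ ...
      }

     private:
      void SetShape(const TBlob &in) {
        // Describe the current input to cuDNN, overwriting any previous setting.
        CUDNN_CALL(cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW,
                                              CUDNN_DATA_FLOAT,
                                              in.shape_[0], in.shape_[1],
                                              in.shape_[2], in.shape_[3]));
      }
      cudnnTensorDescriptor_t desc_;
    };

Because all setup is re-run per call, a single instance can be cached (e.g. thread_local, as the earlier patches in this series do) and reused across changing shapes without going stale.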
--- src/operator/nn/cudnn/cudnn_pooling-inl.h | 283 ++++++++++------------ 1 file changed, 127 insertions(+), 156 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 104ed8546dca..8442b37058d4 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -34,13 +34,18 @@ namespace mxnet { namespace op { template -class CuDNNPoolingOp : public Operator { +class CuDNNPoolingOp { public: - explicit CuDNNPoolingOp(PoolingParam p) { - param_ = p; - init_cudnn_ = false; + CuDNNPoolingOp() { // TODO(xxx): fp16 dtype_ = mshadow::DataType::kCudnnFlag; + CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + } + + void Init(const PoolingParam &p) { + param_ = p; switch (param_.pool_type) { case pool_enum::kMaxPooling: mode_ = CUDNN_POOLING_MAX; @@ -54,33 +59,24 @@ class CuDNNPoolingOp : public Operator { } ~CuDNNPoolingOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; + this->Init(s, in_data, out_data); if (param_.kernel.ndim() == 2) { // 2d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, @@ -93,11 +89,8 @@ class CuDNNPoolingOp : public Operator { out.dptr_)); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, @@ -113,31 +106,23 @@ class CuDNNPoolingOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &in_data, const TBlob &out_data, + const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 
1U); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; + this->Init(s, in_data, out_data); if (param_.kernel.ndim() == 2) { // 2d pool - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + Tensor m_out_grad = out_grad.get(s); + Tensor m_in_data = in_data.get(s); + Tensor m_out_data = out_data.get(s); + Tensor m_in_grad = in_grad.get(s); CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -152,10 +137,10 @@ class CuDNNPoolingOp : public Operator { m_in_grad.dptr_)); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + Tensor m_out_grad = out_grad.get(s); + Tensor m_in_data = in_data.get(s); + Tensor m_out_data = out_data.get(s); + Tensor m_in_grad = in_grad.get(s); CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -174,129 +159,115 @@ class CuDNNPoolingOp : public Operator { } private: - inline void Init(mshadow::Stream *s, - const std::vector &in_data, - const std::vector &out_data) { + inline void Init(mshadow::Stream *s, const TBlob &in_data, + const TBlob &out_data) { using namespace mshadow; #if CUDNN_MAJOR >= 5 nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; #endif - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - if (!init_cudnn_) { - init_cudnn_ = true; - if (param_.kernel.ndim() == 2) { - // 2d conv - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - mshadow::Shape<4> dshape = data.shape_; - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - out.shape_[0], - out.shape_[1], - out.shape_[2], - out.shape_[3])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - nan_prop_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 :param_.stride[1])); - #else - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 
1 : param_.stride[1])); - #endif - } else { - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - std::vector ishape = {static_cast(data.shape_[0]), - static_cast(data.shape_[1]), - static_cast(data.shape_[2]), - static_cast(data.shape_[3]), - static_cast(data.shape_[4])}; + if (param_.kernel.ndim() == 2) { + // 2d conv + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + mshadow::Shape<4> dshape = data.shape_; + CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); + CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + out.shape_[0], + out.shape_[1], + out.shape_[2], + out.shape_[3])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + nan_prop_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 :param_.stride[1])); + #else + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1])); + #endif + } else { + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + std::vector ishape = {static_cast(data.shape_[0]), + static_cast(data.shape_[1]), + static_cast(data.shape_[2]), + static_cast(data.shape_[3]), + static_cast(data.shape_[4])}; - std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[3] * ishape[4]), - static_cast(ishape[4]), - 1}; + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), 1}; - std::vector oshape = {static_cast(out.shape_[0]), - static_cast(out.shape_[1]), - static_cast(out.shape_[2]), - static_cast(out.shape_[3]), - static_cast(out.shape_[4])}; + std::vector oshape = {static_cast(out.shape_[0]), + static_cast(out.shape_[1]), + static_cast(out.shape_[2]), + static_cast(out.shape_[3]), + static_cast(out.shape_[4])}; - std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[3] * oshape[4]), - static_cast(oshape[4]), - 1}; + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), 1}; - std::vector kernel_vec = {param_.global_pool ? ishape[2] : - static_cast(param_.kernel[0]), - param_.global_pool ? ishape[3] : - static_cast(param_.kernel[1]), - param_.global_pool ? ishape[4] : - static_cast(param_.kernel[2])}; + std::vector kernel_vec = {param_.global_pool ? ishape[2] : + static_cast(param_.kernel[0]), + param_.global_pool ? ishape[3] : + static_cast(param_.kernel[1]), + param_.global_pool ? ishape[4] : + static_cast(param_.kernel[2])}; - std::vector pad_vec = {param_.global_pool ? 
0 : static_cast(param_.pad[0]), - param_.global_pool ? 0 : static_cast(param_.pad[1]), - param_.global_pool ? 0 : static_cast(param_.pad[2])}; + std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), + param_.global_pool ? 0 : static_cast(param_.pad[1]), + param_.global_pool ? 0 : static_cast(param_.pad[2])}; - std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), - param_.global_pool ? 1 : static_cast(param_.stride[1]), - param_.global_pool ? 1 : static_cast(param_.stride[2])}; + std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), + param_.global_pool ? 1 : static_cast(param_.stride[1]), + param_.global_pool ? 1 : static_cast(param_.stride[2])}; - CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, - dtype_, - static_cast(ishape.size()), - &ishape[0], - &istride[0])); - CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, - dtype_, - static_cast(oshape.size()), - &oshape[0], - &ostride[0])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, - mode_, - nan_prop_, - static_cast(kernel_vec.size()), - &(kernel_vec[0]), - &(pad_vec[0]), - &(stride_vec[0]))); - #else - LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve"; - #endif - } + CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(ishape.size()), + &ishape[0], + &istride[0])); + CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.size()), + &oshape[0], + &ostride[0])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, + mode_, + nan_prop_, + static_cast(kernel_vec.size()), + &(kernel_vec[0]), + &(pad_vec[0]), + &(stride_vec[0]))); + #else + LOG(FATAL) << "3D pooling only supports CUDNN v5 and above"; + #endif } } - bool init_cudnn_; + cudnnDataType_t dtype_; cudnnHandle_t handle_; cudnnPoolingMode_t mode_; From e9387941ef7738f09631104ad3b7825764679ce8 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:00:23 +0000 Subject: [PATCH 013/264] Use NNVM interface for CuDNN softmax activation.
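The op is reduced to Init() plus shape-agnostic Forward/Backward over single TBlobs, so a caller can hold one instance per worker thread and re-point it at the current parameters before each launch. A hypothetical FCompute wrapper in that style, mirroring the pattern already used for Activation earlier in this series (the wrapper itself is not part of this patch):

    static void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs,
                                         const OpContext& ctx,
                                         const std::vector<TBlob>& inputs,
                                         const std::vector<OpReqType>& req,
                                         const std::vector<TBlob>& outputs) {
      const SoftmaxActivationParam& param =
          nnvm::get<SoftmaxActivationParam>(attrs.parsed);
      static thread_local CuDNNSoftmaxActivationOp op;  // one op per thread
      op.Init(param);                     // cheap: only stores the parameters
      op.Forward(ctx, inputs[0], req[0], outputs[0]);
    }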
--- .../nn/cudnn/cudnn_softmax_activation-inl.h | 102 +++++++----------- 1 file changed, 41 insertions(+), 61 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h index 5afdb4844364..239da023668d 100644 --- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h @@ -32,73 +32,64 @@ namespace mxnet { namespace op { -class CuDNNSoftmaxActivationOp : public Operator { +class CuDNNSoftmaxActivationOp { public: - explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) { - this->param_ = param; - init_cudnn_ = false; + CuDNNSoftmaxActivationOp() { dtype_ = CUDNN_DATA_FLOAT; + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); + } + + void Init(SoftmaxActivationParam param) { + this->param_ = param; } ~CuDNNSoftmaxActivationOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2) + CHECK_EQ(in_data.ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0], - in_data[softmax_activation::kData].shape_[1], 1, 1); - data = in_data[softmax_activation::kData].get_with_shape(dshape, s); - out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_data.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_data[softmax_activation::kData].Size(); + index_t size_left = in_data.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_data[softmax_activation::kData].ndim()) { - dshape[i] = in_data[softmax_activation::kData].shape_[i]; + if (i < in_data.ndim()) { + dshape[i] = in_data.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[softmax_activation::kData].get_with_shape(dshape, s); - out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } float alpha = 1.0f; float beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - init_cudnn_ = true; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - } + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); 
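+    // The descriptor is reconfigured on every call now that it is created in
+    // the constructor, so one op instance can serve inputs of varying shape.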
CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_, CUDNN_SOFTMAX_ACCURATE, softmax_mode, @@ -110,19 +101,10 @@ class CuDNNSoftmaxActivationOp : public Operator { out.dptr_)); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); @@ -132,31 +114,30 @@ class CuDNNSoftmaxActivationOp : public Operator { Tensor input_grad; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2) + CHECK_EQ(in_grad.ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0], - in_grad[softmax_activation::kData].shape_[1], 1, 1); - grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); - output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1); + grad = out_grad.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_grad.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_grad[softmax_activation::kData].Size(); + index_t size_left = in_grad.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad[softmax_activation::kData].ndim()) { - dshape[i] = in_grad[softmax_activation::kData].shape_[i]; + if (i < in_grad.ndim()) { + dshape[i] = in_grad.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); - grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); @@ -174,7 +155,6 @@ class CuDNNSoftmaxActivationOp : public Operator { } private: - bool init_cudnn_; cudnnDataType_t dtype_; cudnnTensorDescriptor_t shape_desc_; SoftmaxActivationParam param_; From b14062188e799efcf06eac71566beaa144b4255f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:01:41 +0000 Subject: [PATCH 014/264] Use NNVM interface for CuDNN activation. 
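As in the softmax-activation patch, Forward and Backward coerce arbitrary-rank inputs into the 4-D tensors cuDNN expects: the first three axes pass through and everything remaining is folded into the last axis, so a (2, 3, 4, 5, 6) input is described to cuDNN as (2, 3, 4, 30) and a 2-D (N, C) input as (N, C, 1, 1). A standalone sketch of that folding (an illustrative helper, not code from this patch):

    // Fold an n-dim shape into 4 dims: keep up to the first three axes and
    // put the product of axes 3..n-1 into the last slot (1 when ndim <= 3).
    inline mshadow::Shape<4> FoldTo4D(const TShape &shape) {
      mshadow::Shape<4> dshape;
      index_t size_left = shape.Size();
      for (int i = 0; i < 3; ++i) {
        dshape[i] = (i < shape.ndim()) ? shape[i] : 1;
        size_left /= dshape[i];
      }
      dshape[3] = size_left;
      return dshape;
    }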
--- src/operator/nn/cudnn/cudnn_activation-inl.h | 125 +++++++++---------- 1 file changed, 58 insertions(+), 67 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h index 888528309cdf..35827917c7d5 100644 --- a/src/operator/nn/cudnn/cudnn_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_activation-inl.h @@ -33,12 +33,18 @@ namespace mxnet { namespace op { template -class CuDNNActivationOp : public Operator { +class CuDNNActivationOp { public: - explicit CuDNNActivationOp(ActivationParam param) { - param_ = param; - init_cudnn_ = false; + CuDNNActivationOp() { dtype_ = mshadow::DataType::kCudnnFlag; + #if CUDNN_MAJOR >= 5 + nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; + CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); + #endif + } + + void Init(const ActivationParam ¶m) { + param_ = param; switch (param_.act_type) { case activation::kReLU: mode_ = CUDNN_ACTIVATION_RELU; @@ -54,67 +60,55 @@ class CuDNNActivationOp : public Operator { break; } #if CUDNN_MAJOR >= 5 - nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; - CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_)); #endif + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); } ~CuDNNActivationOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); - #endif - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); + #endif } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; - if (in_data[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], 1, 1); - data = in_data[activation::kData].get_with_shape(dshape, s); - out = out_data[activation::kOut].get_with_shape(dshape, s); + if (in_data.ndim() == 2) { + Shape<4> dshape = Shape4(in_data.shape_[0], + in_data.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_data[activation::kData].Size(); + index_t size_left = in_data.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_data[activation::kData].ndim()) { - dshape[i] = in_data[activation::kData].shape_[i]; + if (i < in_data.ndim()) { + dshape[i] = in_data.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[activation::kData].get_with_shape(dshape, s); - out = out_data[activation::kOut].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); } typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - init_cudnn_ = true; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - } 
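+    // Descriptor creation moved to the constructor; only the per-call shape
+    // configuration remains on the forward path.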
+ CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationForward(s->dnn_handle_, mode_, @@ -136,20 +130,11 @@ class CuDNNActivationOp : public Operator { #endif } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &in_data, const TBlob &out_data, + const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; Stream *s = ctx.get_stream(); @@ -157,31 +142,38 @@ class CuDNNActivationOp : public Operator { Tensor data; Tensor output_data; Tensor input_grad; - if (in_grad[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0], - in_grad[activation::kData].shape_[1], 1, 1); - data = in_data[activation::kData].get_with_shape(dshape, s); - grad = out_grad[activation::kOut].get_with_shape(dshape, s); - output_data = out_data[activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[activation::kData].get_with_shape(dshape, s); + if (in_grad.ndim() == 2) { + Shape<4> dshape = Shape4(in_grad.shape_[0], + in_grad.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_grad[activation::kData].Size(); + index_t size_left = in_grad.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad[activation::kData].ndim()) { - dshape[i] = in_grad[activation::kData].shape_[i]; + if (i < in_grad.ndim()) { + dshape[i] = in_grad.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[activation::kData].get_with_shape(dshape, s); - output_data = out_data[activation::kOut].get_with_shape(dshape, s); - grad = out_grad[activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[activation::kData].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_, mode_, @@ -212,7 +204,6 @@ class CuDNNActivationOp : public Operator { } private: - bool init_cudnn_; cudnnDataType_t dtype_; cudnnActivationMode_t mode_; cudnnTensorDescriptor_t shape_desc_; From 73383cf529fdfb8d0c18ca9b08bc8df54754a85a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:07:30 +0000 Subject: [PATCH 015/264] Use NNVM interface for CuDNN convolution. 
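With the constructor emptied, nothing is configured until Init() runs, so every entry point must be driven against the current shapes: Init() sets up all descriptors and selects algorithms, and GetTempSize() is re-run inside Backward() before workspace allocation instead of being latched behind init_temp_size_. A hypothetical driving sequence for one pass (types from this file; the driver itself is illustrative):

    CuDNNConvolutionOp<DType> op;        // descriptors exist but are unconfigured
    op.Init(param, fwd_compute_type, bwd_compute_type,
            in_shapes, out_shapes, ctx);                 // sizes everything
    op.Forward(op_ctx, in_data, req, out_data);
    op.Backward(op_ctx, out_grad, in_data, req, in_grad);  // re-runs GetTempSize()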
--- src/operator/nn/cudnn/cudnn_convolution-inl.h | 60 ++++++++----------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index f37203998e0a..fe10c42aaea7 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -42,9 +42,19 @@ namespace op { * \brief The Operator used to perform convolution using cuDNN kernels. */ template -class CuDNNConvolutionOp : public Operator { +class CuDNNConvolutionOp { public: - explicit CuDNNConvolutionOp(const ConvolutionParam& param, + CuDNNConvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(const ConvolutionParam& param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -57,8 +67,6 @@ class CuDNNConvolutionOp : public Operator { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); - init_cudnn_ = false; - init_temp_size_ = false; dtype_ = DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -92,22 +100,19 @@ class CuDNNConvolutionOp : public Operator { } ~CuDNNConvolutionOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; DType *data_ptr = NULL; @@ -183,13 +188,11 @@ class CuDNNConvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 
3 : 2; @@ -199,7 +202,8 @@ class CuDNNConvolutionOp : public Operator { DType *data_ptr = NULL; DType *gdata_ptr = NULL; CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); if (param_.kernel.ndim() == 2) { Tensor grad = out_grad[conv::kOut].get(s); @@ -224,6 +228,7 @@ class CuDNNConvolutionOp : public Operator { data_ptr = data.dptr_; gdata_ptr = gdata.dptr_; } + GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { @@ -361,13 +366,6 @@ class CuDNNConvolutionOp : public Operator { size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[conv::kData]; TShape wshape = in_shape[conv::kWeight]; @@ -573,7 +571,6 @@ class CuDNNConvolutionOp : public Operator { &bias_shape[0], &bias_stride[0])); } - init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -817,7 +814,6 @@ class CuDNNConvolutionOp : public Operator { } void GetTempSize(const OpContext& ctx) { - if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_size = 0, back_size_w = 0; CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, @@ -842,8 +838,6 @@ class CuDNNConvolutionOp : public Operator { out_desc_, forward_algo_.AlgoNumber(), &forward_workspace_byte_)); - - init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -876,8 +870,6 @@ class CuDNNConvolutionOp : public Operator { std::vector param_dilate_; std::vector param_pad_; - bool init_cudnn_; - bool init_temp_size_; // Temp workspace size in bytes needed for Forward() operation. size_t forward_workspace_byte_; // Temp workspace size in bytes needed for Backward() operation. From b1876e68a05adb2131cb7369090868e1071f1e03 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:08:38 +0000 Subject: [PATCH 016/264] Use NNVM interface for CuDNN deconvolution. 
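Unlike convolution, the backward pass here never needs the forward output, and the bias gradient is reduced directly from the output gradient, so in_data shrinks to exactly {data, weight} (hence the new CHECK_EQ(in_data.size(), 2U)) while in_grad keeps one slot per forward argument, bias included. Sketched as a hypothetical gradient wrapper (the wiring is illustrative, not code from this patch):

    // inputs  = {out_grad, data, weight}
    // outputs = in_grad = {grad_data, grad_weight[, grad_bias]}
    std::vector<TBlob> out_grad(1, inputs[0]);
    std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
    op.Backward(ctx, out_grad, in_data, req, outputs);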
--- .../nn/cudnn/cudnn_deconvolution-inl.h | 63 +++++++++---------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 09e89c27bbaf..7d309e09d589 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -39,9 +39,19 @@ namespace op { #if MXNET_USE_CUDNN == 1 template -class CuDNNDeconvolutionOp : public Operator { +class CuDNNDeconvolutionOp { public: - explicit CuDNNDeconvolutionOp(DeconvolutionParam param, + CuDNNDeconvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(DeconvolutionParam param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -54,8 +64,6 @@ class CuDNNDeconvolutionOp : public Operator { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); - init_cudnn_ = false; - init_temp_size_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy. cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -89,22 +97,19 @@ class CuDNNDeconvolutionOp : public Operator { } ~CuDNNDeconvolutionOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; DType *data_ptr = NULL; @@ -197,13 +202,11 @@ class CuDNNDeconvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 
3 : 2; @@ -213,7 +216,8 @@ class CuDNNDeconvolutionOp : public Operator { DType *data_ptr = NULL; DType *gdata_ptr = NULL; CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); if (param_.kernel.ndim() == 2) { Tensor grad = out_grad[deconv::kOut].get(s); @@ -380,13 +384,6 @@ class CuDNNDeconvolutionOp : public Operator { size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[deconv::kData]; TShape wshape = in_shape[deconv::kWeight]; @@ -591,7 +588,6 @@ class CuDNNDeconvolutionOp : public Operator { &bias_shape[0], &bias_stride[0])); } - init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -844,7 +840,6 @@ class CuDNNDeconvolutionOp : public Operator { } void GetTempSize(const OpContext& ctx) { - if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_data_algo_workspace_size = 0; size_t back_filter_algo_workspace_size = 0; @@ -874,7 +869,6 @@ class CuDNNDeconvolutionOp : public Operator { forward_workspace_byte_ = back_data_algo_workspace_size; backward_workspace_byte_ = std::max(forward_algo_workspace_size, back_filter_algo_workspace_size); - init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -905,8 +899,11 @@ class CuDNNDeconvolutionOp : public Operator { std::vector param_stride_; std::vector param_dilate_; - bool init_cudnn_; - bool init_temp_size_; + int forward_compute_type_; + int backward_compute_type_; + const std::vector in_shapes_; + const std::vector out_shapes_; + // Temp workspace size in bytes needed for Forward() operation. Note that // in deconvolution, this is handled by the cuDNN backprop-to-data kernel. size_t forward_workspace_byte_; From 05ebaefefb81ddd7b0f12c43b512eeee0f8f9a7e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:22:17 +0000 Subject: [PATCH 017/264] Fix a compilation error in convolution. --- src/operator/nn/convolution.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index bca8adcba2a0..2bd50b15a395 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -400,12 +400,15 @@ There are other options to tune the performance. 
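The error appears to stem from an upstream interface change: FInferStorageType now takes the device mask and a DispatchMode out-parameter instead of a Context, and ElemwiseStorageType gained three extra bool template arguments. The shape of the new interface the lambda below must match (parameter names are illustrative):

    bool InferStorageType(const nnvm::NodeAttrs& attrs,
                          const int dev_mask,           // device type the node runs on
                          DispatchMode* dispatch_mode,  // out: chosen kernel dispatch
                          std::vector<int>* in_attrs,   // in/out: input storage types
                          std::vector<int>* out_attrs); // in/out: output storage types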
.set_attr("FInferShape", ConvolutionShape) .set_attr("FInferType", ConvolutionType) .set_attr("FInferStorageType", [](const nnvm::NodeAttrs& attrs, - const Context& ctx, std::vector *in_attrs, std::vector *out_attrs) { + const int dev_mask, DispatchMode* dispatch_mode, + std::vector *in_attrs, std::vector *out_attrs) { const ConvolutionParam& params = nnvm::get(attrs.parsed); if (params.no_bias) - return ElemwiseStorageType<2, 1>(attrs, ctx, in_attrs, out_attrs); + return ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); else - return ElemwiseStorageType<3, 1>(attrs, ctx, in_attrs, out_attrs); + return ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); }) .set_attr("FCompute", ConvolutionCompute) .set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) From ce569f6217427e4d3247551d1a4294b8b1da1b46 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:22:29 +0000 Subject: [PATCH 018/264] Fix a compilation error in activation. --- src/operator/nn/activation-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index c46c33d6dfda..5e701a205e1e 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -68,9 +68,9 @@ void ActivationForward(const OpContext &ctx, const TBlob &in_data, using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - const size_t sz = input.shape_.Size(); + const size_t sz = in_data.shape_.Size(); if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, { + MXNET_ASSIGN_REQ_SWITCH(req, Req, { mxnet_op::Kernel, xpu>::Launch( s, sz, out_data.dptr(), @@ -88,7 +88,7 @@ void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, Stream *s = ctx.get_stream(); const size_t sz = out_data.shape_.Size(); if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, { + MXNET_ASSIGN_REQ_SWITCH(req, Req, { mxnet_op::Kernel, Req>, xpu>::Launch( s, sz, From d7dcd1fb50bc83a7f6cdbc4266017adf7539b6fe Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:45:42 +0000 Subject: [PATCH 019/264] Fix coding style. 
--- src/operator/nn/activation-inl.h | 16 +++++----- src/operator/nn/batch_norm-inl.h | 32 ++++++++++---------- src/operator/nn/batch_norm.cc | 5 ++-- src/operator/nn/convolution-inl.h | 26 ++++++++-------- src/operator/nn/convolution.cc | 5 ++-- src/operator/nn/deconvolution-inl.h | 38 ++++++++++++------------ src/operator/nn/deconvolution.cc | 5 ++-- src/operator/nn/deconvolution.cu | 22 ++++++++------ src/operator/nn/dropout-inl.h | 21 ++++++------- src/operator/nn/fully_connected-inl.h | 22 +++++++------- src/operator/nn/fully_connected.cc | 5 ++-- src/operator/nn/fully_connected.cu | 16 +++++----- src/operator/nn/pooling-inl.h | 22 +++++++------- src/operator/nn/pooling.cc | 2 +- src/operator/nn/pooling.cu | 16 +++++----- src/operator/nn/softmax_activation-inl.h | 20 ++++++------- src/operator/nn/softmax_activation.cu | 16 +++++----- src/operator/nn/upsampling-inl.h | 22 +++++++------- src/operator/nn/upsampling.cc | 4 +-- 19 files changed, 162 insertions(+), 153 deletions(-) diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index 5e701a205e1e..d8da30b7263a 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -101,10 +101,10 @@ void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, template void ActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); @@ -134,10 +134,10 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs, template void ActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { #if MXNET_USE_CUDNN == 1 CHECK_EQ(inputs.size(), 3U); #else diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index b229290dd3a8..4838570bda97 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -110,10 +110,10 @@ class BatchNormOp { * \sa OpReqType, OpContext */ void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { using namespace mshadow; using namespace mshadow::expr; @@ -160,12 +160,12 @@ class BatchNormOp { * \sa OperatorProperty, OpReqType, OpContext */ void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { CHECK_EQ(out_grad.size(), param_.output_mean_var ? 
3U : 1U); CHECK_EQ(in_data.size(), 3U); CHECK_EQ(out_data.size(), 3U); @@ -222,9 +222,9 @@ static BatchNormOp &GetBatchNormOp(const BatchNormParam& pa template void BatchNormCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const BatchNormParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 5U); std::vector in_data(inputs.begin(), inputs.begin() + 3); @@ -237,9 +237,9 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, template void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 11U); const BatchNormParam& param = nnvm::get(attrs.parsed); std::vector out_grad(inputs.begin(), diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 9ce4febd3eef..298de204a53f 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -318,7 +318,8 @@ void BatchNormOp::DoBackward(mshadow::Stream *, DMLC_REGISTER_PARAMETER(BatchNormParam); static bool BatchNormShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { + std::vector *in_shape, + std::vector *out_shape) { const BatchNormParam& param = nnvm::get(attrs.parsed); using namespace mshadow; CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]"; @@ -357,7 +358,7 @@ static inline std::vector ListOutputs() { } static bool BatchNormType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { + std::vector *in_type, std::vector *out_type) { using namespace mshadow; CHECK_GE(in_type->size(), 1U); const int dtype = (*in_type)[0]; diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 4af16f0aa231..84babed946b3 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -161,9 +161,9 @@ class ConvolutionOp { } void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req[conv::kOut], kWriteTo); @@ -233,10 +233,10 @@ class ConvolutionOp { } void Backward(const OpContext &ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& req, - const std::vector& in_grad) { + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& req, + const std::vector& in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); @@ -387,9 +387,9 @@ class ConvolutionOp { template void ConvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { static thread_local ConvolutionOp op; @@ -400,9 +400,9 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, template void ConvolutionGradCompute(const 
nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); std::vector in_data(inputs.begin() + 1, inputs.end()); const TBlob &out_grad = inputs[0]; diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 2bd50b15a395..9cc0914e1f01 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -52,7 +52,8 @@ static inline std::vector ListArguments(const ConvolutionParam& par } static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { + std::vector *in_shape, + std::vector *out_shape) { using namespace mshadow; const ConvolutionParam& param_ = nnvm::get(attrs.parsed); if (!param_.no_bias) { @@ -241,7 +242,7 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, } static bool ConvolutionType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { + std::vector *in_type, std::vector *out_type) { const ConvolutionParam& param_ = nnvm::get(attrs.parsed); CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 42ab9cb1aba9..d97eb0ab4304 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -204,9 +204,9 @@ class DeconvolutionOp { } void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; @@ -311,10 +311,10 @@ class DeconvolutionOp { } void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &req, - const std::vector &in_grad) { + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful @@ -455,9 +455,9 @@ class DeconvolutionOp { template void _DeconvolutionCompute(const DeconvolutionParam& param, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { static thread_local DeconvolutionOp op; op.Init(param); @@ -467,18 +467,18 @@ void _DeconvolutionCompute(const DeconvolutionParam& param, template void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); _DeconvolutionCompute(param, ctx, inputs, req, outputs); } template void _DeconvolutionGradCompute(const DeconvolutionParam& param, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { std::vector in_data(inputs.begin() + 1, inputs.end()); const TBlob &out_grad = 
inputs[0]; const std::vector &in_grad = outputs; @@ -493,9 +493,9 @@ void _DeconvolutionGradCompute(const DeconvolutionParam& param, template void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); _DeconvolutionGradCompute(param, ctx, inputs, req, outputs); } diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index eb958154baa7..3dd3f9f013a0 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -30,7 +30,8 @@ namespace mxnet { namespace op { static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { + std::vector *in_shape, + std::vector *out_shape) { const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); #if MXNET_USE_CUDNN == 0 if (param_.kernel.ndim() != 2) { @@ -236,7 +237,7 @@ static inline std::vector ListArguments(const DeconvolutionParam& p } static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { + std::vector *in_type, std::vector *out_type) { const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 0c2e160cf696..2812e4f46e12 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -41,9 +41,11 @@ static DeconvolutionOp &get_op(const DeconvolutionParam& param) { template static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param, - int forward_compute_type, int backward_compute_type, - const std::vector& in_shape, const std::vector& out_shape, - const Context& ctx, bool backward) { + int forward_compute_type, + int backward_compute_type, + const std::vector& in_shape, + const std::vector& out_shape, + const Context& ctx, bool backward) { // Convolution forward has to be called before backward for this operator. // So we can't make this operator thread local. backward might be called // in another thread. 
@@ -55,9 +57,10 @@ static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param template<> void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); int dtype = inputs[0].type_flag_; // If 1D deconvolution, use MXNet implementation @@ -98,9 +101,10 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, template<> void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); std::vector in_data(inputs.begin() + 1, inputs.end()); const TBlob &out_grad = inputs[0]; diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 4a9228f9a14e..222b0694824c 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -101,7 +101,7 @@ class DropoutOp { } void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, const std::vector &out_data) { + const std::vector &req, const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1U); @@ -136,7 +136,8 @@ class DropoutOp { } void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &out_data_mask, const OpReqType &req, const TBlob &in_grad) { + const TBlob &out_data_mask, const OpReqType &req, + const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); @@ -169,10 +170,10 @@ class DropoutOp { template void DropoutCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const DropoutParam& param = nnvm::get(attrs.parsed); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { static thread_local DropoutOp op; @@ -183,10 +184,10 @@ void DropoutCompute(const nnvm::NodeAttrs& attrs, template void DropoutGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const DropoutParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1); diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index 07965c354930..73312c7dec68 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -73,7 +73,7 @@ class FullyConnectedOp { } void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, const std::vector &out_data) { + const std::vector &req, const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; if (req[fullc::kOut] == kNullOp) return; @@ -113,8 +113,8 @@ class FullyConnectedOp { } void Backward(const OpContext &ctx, const std::vector &out_grad, - const std::vector &in_data, const std::vector &req, - const std::vector &in_grad) { + const std::vector 
&in_data, const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful @@ -175,10 +175,10 @@ class FullyConnectedOp { template void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); uint32_t in_expected = param.no_bias ? 2 : 3; CHECK_EQ(inputs.size(), in_expected); @@ -205,10 +205,10 @@ void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, template void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); uint32_t out_expected = param.no_bias ? 2 : 3; CHECK_EQ(inputs.size(), 3U); diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 6524fbe349f9..cc475e04dd44 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -31,7 +31,8 @@ namespace mxnet { namespace op { static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { + std::vector *in_shape, + std::vector *out_shape) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); using namespace mshadow; if (!param.no_bias) { @@ -71,7 +72,7 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, } static bool FullyConnectedType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { + std::vector *in_type, std::vector *out_type) { CHECK_GE(in_type->size(), 1U); return ElemwiseAttr( attrs, in_type, out_type, -1); diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu index 81bc1a75aa58..7637865f2472 100644 --- a/src/operator/nn/fully_connected.cu +++ b/src/operator/nn/fully_connected.cu @@ -28,10 +28,10 @@ namespace op { template<> void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); uint32_t in_expected = param.no_bias ? 2 : 3; CHECK_EQ(inputs.size(), in_expected); @@ -46,10 +46,10 @@ void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, template<> void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); uint32_t out_expected = param.no_bias ? 
2 : 3; CHECK_EQ(inputs.size(), 3U); diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 015b83c4fbc6..d635d81f9108 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -88,7 +88,7 @@ class PoolingOp { } void Forward(const OpContext& ctx, const TBlob& in_data, - const OpReqType& req, const TBlob& out_data) { + const OpReqType& req, const TBlob& out_data) { using namespace mshadow; Stream *s = ctx.get_stream(); const TShape& ishape = in_data.shape_; @@ -103,8 +103,8 @@ class PoolingOp { } void Backward(const OpContext& ctx, const TBlob& out_grad, - const TBlob& in_data, const TBlob& out_data, - const OpReqType& req, const TBlob& in_grad) { + const TBlob& in_data, const TBlob& out_data, + const OpReqType& req, const TBlob& in_grad) { using namespace mshadow; Stream *s = ctx.get_stream(); const TShape& ishape = in_data.shape_; @@ -132,10 +132,10 @@ PoolingOp &GetPoolingOp(const PoolingParam ¶m) { template void PoolingCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); const PoolingParam& param = nnvm::get(attrs.parsed); @@ -152,10 +152,10 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, template void PoolingGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 3U); CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 3c30e1924323..41ace3cecae0 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -60,7 +60,7 @@ static void PoolingParamParser(nnvm::NodeAttrs* attrs) { } static bool PoolingShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { + std::vector *in_shape, std::vector *out_shape) { const PoolingParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), 1U); const TShape &dshape = (*in_shape)[0]; diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index 4d5c68f7ca6b..24aa4178b3c7 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -43,10 +43,10 @@ static CuDNNPoolingOp &GetCuDNNPoolingOp(const PoolingParam ¶m) { template<> void PoolingCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); const PoolingParam& param = nnvm::get(attrs.parsed); @@ -80,10 +80,10 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, template<> void PoolingGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 3U); CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h index 
5d0e937e218d..5b91b6f79e98 100644 --- a/src/operator/nn/softmax_activation-inl.h +++ b/src/operator/nn/softmax_activation-inl.h @@ -73,7 +73,7 @@ class SoftmaxActivationOp { } void Forward(const OpContext &ctx, const TBlob &in_data, - const OpReqType &req, const TBlob &out_data) { + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); @@ -94,7 +94,7 @@ class SoftmaxActivationOp { } void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { + const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; // Use 3d tensor for both mode -> {instance, channel}. Get shapes @@ -126,10 +126,10 @@ class SoftmaxActivationOp { template void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); @@ -141,10 +141,10 @@ void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, template void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1); diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu index 9aba20ece514..8e6e787f8072 100644 --- a/src/operator/nn/softmax_activation.cu +++ b/src/operator/nn/softmax_activation.cu @@ -34,10 +34,10 @@ namespace op { template<> void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); @@ -55,10 +55,10 @@ void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, template<> void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1); diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index 91254dad9046..6ce33fcca8cb 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -90,9 +90,9 @@ class UpSamplingNearestOp { } void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), static_cast(param_.num_args)); @@ -126,8 +126,8 @@ class UpSamplingNearestOp { } void 
Backward(const OpContext &ctx, const TBlob &out_grad, - const std::vector &req, - const std::vector &in_grad) { + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_grad.size(), static_cast(param_.num_args)); @@ -198,9 +198,9 @@ static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& pa template void UpSamplingCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const UpSamplingParam& param = nnvm::get(attrs.parsed); if (param.sample_type == up_enum::kNearest) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { @@ -218,9 +218,9 @@ void UpSamplingCompute(const nnvm::NodeAttrs& attrs, template void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { const UpSamplingParam& param = nnvm::get(attrs.parsed); if (param.sample_type == up_enum::kNearest) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc index 87316a939718..44b619ac9516 100644 --- a/src/operator/nn/upsampling.cc +++ b/src/operator/nn/upsampling.cc @@ -32,7 +32,7 @@ namespace mxnet { namespace op { static bool UpSamplingShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { + std::vector *in_shape, std::vector *out_shape) { const UpSamplingParam& param_ = nnvm::get(attrs.parsed); CHECK_GE(in_shape->size(), 1U); const TShape &dshape = (*in_shape)[0]; @@ -87,7 +87,7 @@ static inline std::vector ListArguments(const UpSamplingParam& para } static bool UpSamplingType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { + std::vector *in_type, std::vector *out_type) { const UpSamplingParam& param = nnvm::get(attrs.parsed); CHECK_GE(in_type->size(), 1U); int dtype = (*in_type)[0]; From 538a69b6200536f98a86ed192ab4ef0a162c369a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 22:50:31 +0000 Subject: [PATCH 020/264] Fix coding style for make lint. --- src/operator/nn/convolution.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 9cc0914e1f01..2c010674314d 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -401,7 +401,7 @@ There are other options to tune the performance. .set_attr("FInferShape", ConvolutionShape) .set_attr("FInferType", ConvolutionType) .set_attr("FInferStorageType", [](const nnvm::NodeAttrs& attrs, - const int dev_mask, DispatchMode* dispatch_mode, + const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { const ConvolutionParam& params = nnvm::get(attrs.parsed); if (params.no_bias) From 593d170f048bea8feecfdff4e1bf85a9b6f5abc8 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 23:02:21 +0000 Subject: [PATCH 021/264] use enums in batch norm. 
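Replace the bare integer offsets used to slice the flattened input
blob list with the batchnorm index enums, so the slice boundaries
stay correct if the argument order ever changes. A sketch of the
forward-path slicing after this change (kInMovingMean is the index
at which the auxiliary states begin):

    std::vector<TBlob> in_data(inputs.begin(),
        inputs.begin() + (int) batchnorm::kInMovingMean);
    std::vector<TBlob> aux_states(
        inputs.begin() + (int) batchnorm::kInMovingMean, inputs.end());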
--- src/operator/nn/batch_norm-inl.h | 21 ++++++++++++++------- src/operator/nn/batch_norm.cc | 10 +++++----- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 4838570bda97..1220156f1056 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -227,8 +227,10 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const BatchNormParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 5U); - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector aux_states(inputs.begin() + 3, inputs.end()); + std::vector in_data(inputs.begin(), + inputs.begin() + (int) batchnorm::kInMovingMean); + std::vector aux_states(inputs.begin() + (int) batchnorm::kInMovingMean, + inputs.end()); MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); @@ -242,11 +244,16 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { CHECK_EQ(inputs.size(), 11U); const BatchNormParam& param = nnvm::get(attrs.parsed); - std::vector out_grad(inputs.begin(), - inputs.begin() + (param.output_mean_var ? 3U : 1U)); - std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); - std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); - std::vector out_data(inputs.begin() + 8, inputs.end()); + int num_out_grads = param.output_mean_var ? 3U : 1U; + int in_data_start = 3; + int aux_states_start = in_data_start + (int) batchnorm::kInMovingMean; + int out_data_start = in_data_start + (int) batchnorm::kInMovingVar + 1; + std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); + std::vector in_data(inputs.begin() + in_data_start, + inputs.begin() + aux_states_start); + std::vector aux_states(inputs.begin() + aux_states_start, + inputs.begin() + out_data_start); + std::vector out_data(inputs.begin() + out_data_start, inputs.end()); std::vector in_grad(outputs.begin(), outputs.begin() + 3); MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 298de204a53f..bbf4da9874c4 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -323,7 +323,7 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs, const BatchNormParam& param = nnvm::get(attrs.parsed); using namespace mshadow; CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]"; - const TShape &dshape = in_shape->at(0); + const TShape &dshape = in_shape->at(batchnorm::kData); const size_t channelAxis = static_cast(param.axis < 0 ? 
static_cast(dshape.ndim()) + param.axis @@ -336,10 +336,10 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs, return false; } - in_shape->at(1) = TShape(Shape1(channelCount)); - in_shape->at(2) = TShape(Shape1(channelCount)); - in_shape->at(3) = TShape(Shape1(channelCount)); // kMovingMean - in_shape->at(4) = TShape(Shape1(channelCount)); // kMovingVar + in_shape->at(batchnorm::kGamma) = TShape(Shape1(channelCount)); + in_shape->at(batchnorm::kBeta) = TShape(Shape1(channelCount)); + in_shape->at(batchnorm::kInMovingMean) = TShape(Shape1(channelCount)); // kMovingMean + in_shape->at(batchnorm::kInMovingVar) = TShape(Shape1(channelCount)); // kMovingVar out_shape->clear(); out_shape->push_back(dshape); // kOut From bbb0dbab3a995197e6a91922a86bbd6988ce4a07 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 23:32:31 +0000 Subject: [PATCH 022/264] Use CoreOpRunner for refactored Ops. --- tests/cpp/operator/activation_perf.cc | 4 ++-- tests/cpp/operator/fully_conn_perf.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index e482848705ad..fe51be533510 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -41,7 +41,7 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_activation_args; kwargs.push_back({"act_type", "tanh"}); - test::op::LegacyOpRunner runner; + test::op::CoreOpRunner runner; runner.RunBidirectional(false, { shape }, kwargs, 1); } @@ -52,7 +52,7 @@ TEST(ACTIVATION_PERF, TimingCPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - test::op::LegacyOpRunner runner; + test::op::CoreOpRunner runner; runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc index c8d8021f6f6e..8c32e51e3161 100644 --- a/tests/cpp/operator/fully_conn_perf.cc +++ b/tests/cpp/operator/fully_conn_perf.cc @@ -41,7 +41,7 @@ const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} }; TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { TShape shape({5, 5}); kwargs_t kwargs = basic_fullyconn_args; - test::op::LegacyOpRunner runner; + test::op::CoreOpRunner runner; runner.RunBidirectional(false, { shape }, kwargs, 1); } @@ -50,7 +50,7 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { kwargs_t kwargs = basic_fullyconn_args; - test::op::LegacyOpRunner runner; + test::op::CoreOpRunner runner; runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache From 1e898a320a8e770d77196a2d4a33564ba2e982a1 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 23:40:55 +0000 Subject: [PATCH 023/264] Make FullyConnected stateless. --- src/operator/nn/fully_connected-inl.h | 198 ++++++++++++-------------- 1 file changed, 88 insertions(+), 110 deletions(-) diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index 73312c7dec68..4646d3a5e199 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -61,117 +61,99 @@ struct FullyConnectedParam : public dmlc::Parameter { } }; -/** - * \brief This is the implementation of fully connected operator. 
- * \tparam xpu The device that the op will be executed on. - */ template -class FullyConnectedOp { - public: - void Init(const FullyConnectedParam &p) { - this->param_ = p; - } - - void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - if (req[fullc::kOut] == kNullOp) return; - CHECK_EQ(req[fullc::kOut], kWriteTo); - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - // TODO(bing): judge shape to remove flatten op - Stream *s = ctx.get_stream(); +void FCForward(const OpContext &ctx, const FullyConnectedParam ¶m, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + if (req[fullc::kOut] == kNullOp) return; + CHECK_EQ(req[fullc::kOut], kWriteTo); + // TODO(bing): check the BLAS Handle, be careful + // maybe need blas handle from context + // TODO(bing): judge shape to remove flatten op + Stream *s = ctx.get_stream(); #if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; #endif // __CUDACC__ - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_data[fullc::kOut].shape_; - - Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor data, out; - if (!param_.flatten) { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - } else { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - } - - // Legacy approach shown here for comparison: - // out = dot(data, wmat.T()); - linalg_gemm(data, wmat, out, false, true, s); - if (!param_.no_bias) { - Tensor bias = in_data[fullc::kBias].get(s); - out += repmat(bias, data.size(0)); - } + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_data[fullc::kOut].shape_; + + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor data, out; + if (!param.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + } else { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); } - void Backward(const OpContext &ctx, const std::vector &out_grad, - const std::vector &in_data, const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_grad[fullc::kOut].shape_; - - Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor data, grad, gdata; - if (!param_.flatten) { - data = in_data[fullc::kData].get_with_shape( - 
Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - } else { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - } - -#if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; -#endif - // backprop - CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; - // gradient of weight - Tensor gwmat = in_grad[fullc::kWeight].get(s); - // Legacy approach shown here for comparison: - // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); - linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); - // gradient of bias - if (!param_.no_bias) { - Tensor gbias = in_grad[fullc::kBias].get(s); - Assign(gbias, req[fullc::kBias], sum_rows(grad)); - } - // gradient of data - // Legacy approach shown here for comparison: - // Assign(gdata, req[fullc::kData], dot(grad, wmat)); - linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); + // Legacy approach shown here for comparison: + // out = dot(data, wmat.T()); + linalg_gemm(data, wmat, out, false, true, s); + if (!param.no_bias) { + Tensor bias = in_data[fullc::kBias].get(s); + out += repmat(bias, data.size(0)); } +} - static FullyConnectedOp &get_op(const FullyConnectedParam& param) { - static thread_local FullyConnectedOp op; - op.Init(param); - return op; +template +void FCBackward(const OpContext &ctx, const FullyConnectedParam ¶m, + const std::vector &out_grad, const std::vector &in_data, + const std::vector &req, const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful + // maybe need blas handle from context + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_grad[fullc::kOut].shape_; + + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor data, grad, gdata; + if (!param.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + } else { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); } - private: - FullyConnectedParam param_; -}; // class FullyConnectedOp +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif + // backprop + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; + // gradient of weight + Tensor gwmat = in_grad[fullc::kWeight].get(s); 
+ // Legacy approach shown here for comparison: + // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); + linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); + // gradient of bias + if (!param.no_bias) { + Tensor gbias = in_grad[fullc::kBias].get(s); + Assign(gbias, req[fullc::kBias], sum_rows(grad)); + } + // gradient of data + // Legacy approach shown here for comparison: + // Assign(gdata, req[fullc::kData], dot(grad, wmat)); + linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); +} template void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, @@ -187,12 +169,10 @@ void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, switch (dtype) { case mshadow::kFloat32: - FullyConnectedOp::get_op(param).Forward(ctx, inputs, - req, outputs); + FCForward(ctx, param, inputs, req, outputs); break; case mshadow::kFloat64: - FullyConnectedOp::get_op(param).Forward(ctx, inputs, - req, outputs); + FCForward(ctx, param, inputs, req, outputs); break; case mshadow::kFloat16: LOG(FATAL) << "float16 fully connected layer is currently" @@ -221,12 +201,10 @@ void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, switch (dtype) { case mshadow::kFloat32: - FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data, - req, outputs); + FCBackward(ctx, param, out_grad, in_data, req, outputs); break; case mshadow::kFloat64: - FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data, - req, outputs); + FCBackward(ctx, param, out_grad, in_data, req, outputs); break; case mshadow::kFloat16: LOG(FATAL) << "float16 fully connected layer is currently" From 9854b4bc1d60d92fa37c04988c61e6a262373705 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 00:00:55 +0000 Subject: [PATCH 024/264] Make upsampling stateless. 
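Give UpSampling the same treatment as FullyConnected: fold the
Forward and Backward methods of UpSamplingNearestOp into free
function templates that receive the parameter struct on every call,
so no operator object has to be created or cached per thread. The
call sites change roughly like this (a sketch; the surrounding
MSHADOW_REAL_TYPE_SWITCH is omitted):

    // before: per-thread cached operator state
    static thread_local UpSamplingNearestOp<xpu, DType> op;
    op.Init(param);
    op.Forward(ctx, inputs, req, outputs);

    // after: stateless call, parameters passed explicitly
    UpSamplingForward<xpu, DType>(ctx, param, inputs, req, outputs);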
--- src/operator/nn/upsampling-inl.h | 163 ++++++++++++++----------------- 1 file changed, 75 insertions(+), 88 deletions(-) diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index 6ce33fcca8cb..4b9159edd174 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -83,98 +83,89 @@ struct UpSamplingParam : public dmlc::Parameter { }; // struct UpSamplingParam template -class UpSamplingNearestOp { - public: - void Init(UpSamplingParam p) { - this->param_ = p; +void UpSamplingForward(const OpContext &ctx, const UpSamplingParam ¶m, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), static_cast(param.num_args)); + CHECK_EQ(out_data.size(), 1U); + if (req[up_enum::kOut] == kNullOp) { + return; } - - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), static_cast(param_.num_args)); - CHECK_EQ(out_data.size(), 1U); - if (req[up_enum::kOut] == kNullOp) { - return; - } - Stream *s = ctx.get_stream(); - Tensor out = out_data[up_enum::kOut].get(s); - if (param_.num_args > 1) { - int begin = 0; - for (int i = 0; i < param_.num_args; ++i) { - Tensor data = in_data[i].get(s); - int end = begin + data.size(1); - int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2); - if (param_.multi_input_mode == up_enum::kSum) { - if (i == 0) { - Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale)); - } else { - out += upsampling_nearest(data, scale); - } + Stream *s = ctx.get_stream(); + Tensor out = out_data[up_enum::kOut].get(s); + if (param.num_args > 1) { + int begin = 0; + for (int i = 0; i < param.num_args; ++i) { + Tensor data = in_data[i].get(s); + int end = begin + data.size(1); + int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2); + if (param.multi_input_mode == up_enum::kSum) { + if (i == 0) { + Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale)); } else { - Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale)); + out += upsampling_nearest(data, scale); } - begin = end; + } else { + Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale)); } - } else { - Tensor data = in_data[up_enum::kData].get(s); - Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale)); + begin = end; } + } else { + Tensor data = in_data[up_enum::kData].get(s); + Assign(out, req[up_enum::kOut], upsampling_nearest(data, param.scale)); } +} - void Backward(const OpContext &ctx, const TBlob &out_grad, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_grad.size(), static_cast(param_.num_args)); - Stream *s = ctx.get_stream(); - Tensor grad = out_grad.get(s); - if (param_.num_args > 1) { - int begin = 0; - for (int i = 0; i < param_.num_args; ++i) { - Tensor input_grad = in_grad[i].get(s); - mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); - int end = begin + input_grad.size(1); - int scale = grad.size(2)/in_shape[0]; - if (param_.multi_input_mode == up_enum::kSum) { - Assign(input_grad, req[i], - pool(grad, - in_shape, - scale, - scale, - scale, - scale)); - } else { - Assign(input_grad, req[i], - pool(slice<1>(grad, begin, end), - in_shape, - scale, - scale, - 
scale, - scale)); - } - begin = end; - } - } else { - Tensor input_grad = in_grad[up_enum::kData].get(s); +template +void UpSamplingBackward(const OpContext &ctx, const UpSamplingParam ¶m, + const TBlob &out_grad, const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), static_cast(param.num_args)); + Stream *s = ctx.get_stream(); + Tensor grad = out_grad.get(s); + if (param.num_args > 1) { + int begin = 0; + for (int i = 0; i < param.num_args; ++i) { + Tensor input_grad = in_grad[i].get(s); mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); - Assign(input_grad, req[up_enum::kData], - pool(grad, - in_shape, - param_.scale, - param_.scale, - param_.scale, - param_.scale)); + int end = begin + input_grad.size(1); + int scale = grad.size(2)/in_shape[0]; + if (param.multi_input_mode == up_enum::kSum) { + Assign(input_grad, req[i], + pool(grad, + in_shape, + scale, + scale, + scale, + scale)); + } else { + Assign(input_grad, req[i], + pool(slice<1>(grad, begin, end), + in_shape, + scale, + scale, + scale, + scale)); + } + begin = end; } + } else { + Tensor input_grad = in_grad[up_enum::kData].get(s); + mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); + Assign(input_grad, req[up_enum::kData], + pool(grad, + in_shape, + param.scale, + param.scale, + param.scale, + param.scale)); } - - private: - UpSamplingParam param_; -}; // class UpSamplingNearestOp +} static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) { DeconvolutionParam p = DeconvolutionParam(); @@ -204,9 +195,7 @@ void UpSamplingCompute(const nnvm::NodeAttrs& attrs, const UpSamplingParam& param = nnvm::get(attrs.parsed); if (param.sample_type == up_enum::kNearest) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { - static thread_local UpSamplingNearestOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); + UpSamplingForward(ctx, param, inputs, req, outputs); }); } else if (param.sample_type == up_enum::kBilinear) { DeconvolutionParam p = GetDeconvolutionParam(param); @@ -225,9 +214,7 @@ void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs, if (param.sample_type == up_enum::kNearest) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { CHECK_EQ(inputs.size(), 1U); - static thread_local UpSamplingNearestOp op; - op.Init(param); - op.Backward(ctx, inputs[0], req, outputs); + UpSamplingBackward(ctx, param, inputs[0], req, outputs); }); } else if (param.sample_type == up_enum::kBilinear) { DeconvolutionParam p = GetDeconvolutionParam(param); From 5bb99c8c8335aba596eddab1a416db676389e09b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 00:07:43 +0000 Subject: [PATCH 025/264] Make pooling stateless. 
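Continue the pattern: PoolingOp and the thread_local GetPoolingOp()
cache are replaced by PoolingForward and PoolingBackward function
templates that take the PoolingParam directly. A sketch of the
forward call site after the change:

    PoolingForward<xpu, DType>(ctx, param, inputs[0], req[0],
                               outputs[0]);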
--- src/operator/nn/pooling-inl.h | 80 +++++++++++++++-------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index d635d81f9108..3f511dfaacd9 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -81,53 +81,39 @@ struct PoolingParam : public dmlc::Parameter { }; template -class PoolingOp { - public: - void Init(PoolingParam p) { - this->param_ = p; - } - - void Forward(const OpContext& ctx, const TBlob& in_data, - const OpReqType& req, const TBlob& out_data) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data.shape_; - - pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, - param_.global_pool? - TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) - : param_.kernel, - param_.pad, - param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, req, out_data.dptr()); - } - - void Backward(const OpContext& ctx, const TBlob& out_grad, - const TBlob& in_data, const TBlob& out_data, - const OpReqType& req, const TBlob& in_grad) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data.shape_; - - unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), - in_grad.shape_, out_grad.shape_, - param_.global_pool? - TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) - : param_.kernel, - param_.pad, - param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, req, in_grad.dptr()); - } - - private: - PoolingParam param_; -}; // class PoolingOp +void PoolingForward(const OpContext& ctx, const PoolingParam ¶m, + const TBlob& in_data, const OpReqType& req, + const TBlob& out_data) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data.shape_; + + pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, + param.global_pool? + TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim()) + : param.kernel, + param.pad, + param.global_pool? TShape(param.kernel.ndim()) : param.stride, + param.pool_type, req, out_data.dptr()); +} template -PoolingOp &GetPoolingOp(const PoolingParam ¶m) { - static thread_local PoolingOp op; - op.Init(param); - return op; +void PoolingBackward(const OpContext& ctx, const PoolingParam ¶m, + const TBlob& out_grad, const TBlob& in_data, + const TBlob& out_data, const OpReqType& req, + const TBlob& in_grad) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data.shape_; + + unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), + in_grad.shape_, out_grad.shape_, + param.global_pool? + TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim()) + : param.kernel, + param.pad, + param.global_pool? 
TShape(param.kernel.ndim()) : param.stride, + param.pool_type, req, in_grad.dptr()); } template @@ -143,7 +129,7 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - GetPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + PoolingForward(ctx, param, inputs[0], req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; } @@ -164,7 +150,7 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - GetPoolingOp(param).Backward(ctx, + PoolingBackward(ctx, param, inputs[0], inputs[1], inputs[2], req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; From 046eb81fa26e2ec722e61a5063f3417ed85c644e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 00:19:10 +0000 Subject: [PATCH 026/264] Make dropout stateless. --- src/operator/nn/dropout-inl.h | 131 ++++++++++++++++------------------ 1 file changed, 61 insertions(+), 70 deletions(-) diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 222b0694824c..b206259277a4 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -93,80 +93,74 @@ struct DropoutParam : public dmlc::Parameter { }; // struct DropoutParam template -class DropoutOp { - public: - void Init(const DropoutParam ¶m) { - this->pkeep_ = 1.0f - param.p; - this->mode_ = param.mode; +void DropoutForward(const OpContext &ctx, const DropoutParam ¶m, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + real_t pkeep_ = 1.0f - param.p; + int mode_ = param.mode; + CHECK_EQ(in_data.size(), 1U); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 2U); } - - void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 2U); - } - Stream *s = ctx.get_stream(); - Tensor data = in_data[dropout::kData].FlatTo2D(s); - Tensor out = out_data[dropout::kOut].FlatTo2D(s); - if (ctx.is_train || mode_ == dropout::kAlways) { - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Stream *s = ctx.get_stream(); + Tensor data = in_data[dropout::kData].FlatTo2D(s); + Tensor out = out_data[dropout::kOut].FlatTo2D(s); + if (ctx.is_train || mode_ == dropout::kAlways) { + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) - DType* outptr = out.dptr_; - DType* dataptr = data.dptr_; - auto maskptr = reinterpret_cast(mask.dptr_); - int count = mask.shape_[0]*mask.shape_[1]; - bernoulli_generate(count, this->pkeep_, maskptr); - const float pk_1 = 1.0f / pkeep_; - #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int i = 0; i < count; ++i) { - outptr[i] = dataptr[i] * maskptr[i] * pk_1; - } + DType* outptr = out.dptr_; + DType* dataptr = data.dptr_; + auto maskptr = reinterpret_cast(mask.dptr_); + int count = mask.shape_[0]*mask.shape_[1]; + bernoulli_generate(count, this->pkeep_, maskptr); + const float pk_1 = 1.0f / pkeep_; +#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) + for (int i = 0; i < count; ++i) { + 
outptr[i] = dataptr[i] * maskptr[i] * pk_1; + } #else - Random *prnd = ctx.requested[dropout::kRandom].get_random(s); - mask = tcast(F( - prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_)); - Assign(out, req[dropout::kOut], data * mask); + Random *prnd = ctx.requested[dropout::kRandom].get_random(s); + mask = tcast(F( + prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_)); + Assign(out, req[dropout::kOut], data * mask); #endif // USE_MKL && _OPENMP - } else { - Assign(out, req[dropout::kOut], F(data)); - } + } else { + Assign(out, req[dropout::kOut], F(data)); } +} - void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &out_data_mask, const OpReqType &req, - const TBlob &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = ctx.get_stream(); - Tensor grad = out_grad.FlatTo2D(s); - Tensor mask = out_data_mask.FlatTo2D(s); - Tensor gdata = in_grad.FlatTo2D(s); - if (ctx.is_train || mode_ == dropout::kAlways) { +template +void DropoutBackward(const OpContext &ctx, const DropoutParam &param, + const TBlob &out_grad, const TBlob &out_data_mask, + const OpReqType &req, const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + real_t pkeep_ = 1.0f - param.p; + int mode_ = param.mode; + Stream *s = ctx.get_stream(); + Tensor grad = out_grad.FlatTo2D(s); + Tensor mask = out_data_mask.FlatTo2D(s); + Tensor gdata = in_grad.FlatTo2D(s); + if (ctx.is_train || mode_ == dropout::kAlways) { #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) - DType* ingradptr = gdata.dptr_; - DType* outgradptr = grad.dptr_; - auto maskptr = reinterpret_cast(mask.dptr_); - int count = mask.shape_[0]*mask.shape_[1]; - const float pk_1 = 1.0f / pkeep_; - #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int i = 0; i < count; ++i) { - ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1; - } + DType* ingradptr = gdata.dptr_; + DType* outgradptr = grad.dptr_; + auto maskptr = reinterpret_cast(mask.dptr_); + int count = mask.shape_[0]*mask.shape_[1]; + const float pk_1 = 1.0f / pkeep_; +#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) + for (int i = 0; i < count; ++i) { + ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1; + } #else // USE_MKL && _OPENMP - Assign(gdata, req, grad * mask); + Assign(gdata, req, grad * mask); #endif // USE_MKL && _OPENMP - } else { - Assign(gdata, req, F(grad)); - } + } else { + Assign(gdata, req, F(grad)); } - - private: - real_t pkeep_; - int mode_; -}; // class DropoutOp +} template void DropoutCompute(const nnvm::NodeAttrs& attrs, @@ -176,9 +170,7 @@ void DropoutCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const DropoutParam& param = nnvm::get(attrs.parsed); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - static thread_local DropoutOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); + DropoutForward(ctx, param, inputs, req, outputs); }); } @@ -194,9 +186,8 @@ void DropoutGradCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(req.size(), 1); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - static thread_local DropoutOp op; - op.Init(param); - op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); + DropoutBackward(ctx, param, inputs[0], inputs[1], req[0], + outputs[0]); }); } From f4c6f1c09903253b1d599830089098a3a38f8043 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 00:41:17 +0000 Subject: [PATCH 027/264] Make batchnorm stateless.
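
This commit applies the same refactoring pattern as the pooling and dropout commits above: the stateful operator class with an Init()/Forward()/Backward() lifecycle becomes a set of free function templates that receive the parameter struct on every call, so no mutable state outlives a single invocation and nothing needs to be cached per thread. A minimal, self-contained sketch of the shape of the change (FooOp/FooForward/FooParam are placeholder names, not the real MXNet API):

    struct OpContextSketch {};            // stand-in for mxnet::OpContext
    struct FooParam { float eps; };       // stand-in for a dmlc parameter struct

    // before: state captured at Init() time, cached between calls
    class FooOp {
     public:
      void Init(FooParam p) { param_ = p; }              // mutable state
      void Forward(const OpContextSketch &ctx) { (void)ctx; /* reads param_ */ }
     private:
      FooParam param_;                                    // outlives the call
    };

    // after: everything a call needs arrives as an argument
    void FooForward(const OpContextSketch &ctx, const FooParam &param) {
      (void)ctx; (void)param;  // reads param directly; safe from any engine thread
    }
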
--- src/operator/nn/batch_norm-inl.h | 203 ++++++++++++++----------------- src/operator/nn/batch_norm.cc | 40 +++--- src/operator/nn/batch_norm.cu | 50 ++++---- 3 files changed, 139 insertions(+), 154 deletions(-) diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 1220156f1056..51361064e4c9 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -86,105 +86,21 @@ struct BatchNormParam : public dmlc::Parameter { } }; -/*! \brief Batch normalization operator */ -template -class BatchNormOp { - public: - void Init(BatchNormParam param) { - this->param_ = param; - } - - static inline bool IsWriting(const OpReqType ort) { - return ort == kWriteTo || ort == kWriteInplace; - } - - /*! - * \brief perform a forward operation of Operator, save the output to TBlob. - * \param ctx runtime context available to this call - * \param in_data array of input data, it is const - * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. - * \param out_data array of output data, pointer is used to indicate that this is holder - * the space of TBlob in out_data must be pre-allocated with InferShape - * \param aux_states Auxiliary states of operator. Normally operator doesn't - * need, epecial case like Batch Norm requires. - * \sa OpReqType, OpContext - */ - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(aux_states.size(), 2U); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(req.size(), 3U); - } else { - CHECK_GE(out_data.size(), 1U); - CHECK_GE(req.size(), 1U); - CHECK_EQ(req[batchnorm::kOut], kWriteTo); - } - Stream *s = ctx.get_stream(); - DoForward(s, ctx, in_data, req, out_data, aux_states); - } - - /*! - * \brief Perform a Backward Operation, write gradient to the in_grad. - * - * \note - * Convention: - * out_grad.size() == OperatorProperty.NumVisibleOutputs() - * out_data.size() == OperatorProperty.NumOutputs() - * out_data can contain additional invisible returns that remembers the - * state carried from the Forward pass. For example mask in the dropout. - * The gradients are passed from visible returns in this function. - * - * \par - * Not all the TBlobs in the arguments will be available - * if you override the DeclareBackwardDependency of corresponding OperatorProperty class. - * Only the dependencies you declared will be available at corresponding position, - * the rest of the parameters are simply dummy where you will get a nullptr. - * You will be safe if you use the default DeclareBackwardDependency. - * But only declare what you need will give engine more chance for optimization. - * - * \param ctx runtime context available to this call - * \param out_grad the gradient value we get from of the Operator. - * \param in_data the array of input data. - * \param out_data the array of output data. - * \param req request types of the saving operation, can be all types. - * \param in_grad the array of gradient we need to write to. - * \param aux_states Auxiliary states of operator. 
Normally operator doesn't need - * \sa OperatorProperty, OpReqType, OpContext - */ - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U); - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(in_grad.size(), 3U); - mshadow::Stream *s = ctx.get_stream(); - DoBackward(s, ctx, out_grad, in_data, - out_data, req, in_grad, aux_states); - } +static inline bool IsBNWriting(const OpReqType ort) { + return ort == kWriteTo || ort == kWriteInplace; +} - private: - void DoForward(mshadow::Stream *stream, - const OpContext &ctx, +template +void DoBNForward(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, const std::vector &in_data, const std::vector &req, const std::vector &out_data, const std::vector &aux_states); - void DoBackward(mshadow::Stream *stream, - const OpContext &ctx, +template +void DoBNBackward(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -193,14 +109,16 @@ class BatchNormOp { const std::vector &aux_states); #if MXNET_USE_CUDA - void DoForward(mshadow::Stream *stream, - const OpContext &ctx, +template +void DoBNForward(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, const std::vector &in_data, const std::vector &req, const std::vector &out_data, const std::vector &aux_states); - void DoBackward(mshadow::Stream *stream, - const OpContext &ctx, +template +void DoBNBackward(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -209,15 +127,83 @@ class BatchNormOp { const std::vector &aux_states); #endif // MXNET_USE_CUDA - /*! \brief Batch normalization operator parameters */ - BatchNormParam param_; -}; // class BatchNormOp +/*! + * \brief perform a forward operation of Operator, save the output to TBlob. + * \param ctx runtime context available to this call + * \param in_data array of input data, it is const + * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. + * \param out_data array of output data, pointer is used to indicate that this is holder + * the space of TBlob in out_data must be pre-allocated with InferShape + * \param aux_states Auxiliary states of operator. Normally operator doesn't + * need, special case like Batch Norm requires. + * \sa OpReqType, OpContext + */ +template +void BNForward(const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(aux_states.size(), 2U); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(req.size(), 3U); + } else { + CHECK_GE(out_data.size(), 1U); + CHECK_GE(req.size(), 1U); + CHECK_EQ(req[batchnorm::kOut], kWriteTo); + } + Stream *s = ctx.get_stream(); + DoBNForward(s, ctx, param, in_data, req, out_data, aux_states); +} -template -static BatchNormOp &GetBatchNormOp(const BatchNormParam& param) { - static thread_local BatchNormOp op; - op.Init(param); - return op; +/*!
+ * \brief Perform a Backward Operation, write gradient to the in_grad. + * + * \note + * Convention: + * out_grad.size() == OperatorProperty.NumVisibleOutputs() + * out_data.size() == OperatorProperty.NumOutputs() + * out_data can contain additional invisible returns that remember the + * state carried from the Forward pass. For example, the mask in dropout. + * The gradients are passed from visible returns in this function. + * + * \par + * Not all the TBlobs in the arguments will be available + * if you override the DeclareBackwardDependency of the corresponding OperatorProperty class. + * Only the dependencies you declared will be available at corresponding position, + * the rest of the parameters are simply dummy where you will get a nullptr. + * You will be safe if you use the default DeclareBackwardDependency. + * But declaring only what you need gives the engine more chances for optimization. + * + * \param ctx runtime context available to this call + * \param out_grad the gradient value we get from the Operator. + * \param in_data the array of input data. + * \param out_data the array of output data. + * \param req request types of the saving operation, can be all types. + * \param in_grad the array of gradient we need to write to. + * \param aux_states Auxiliary states of operator. Normally operator doesn't need + * \sa OperatorProperty, OpReqType, OpContext + */ +template +void BNBackward(const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U); + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(in_grad.size(), 3U); + mshadow::Stream *s = ctx.get_stream(); + DoBNBackward(s, ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); } template @@ -232,8 +218,7 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, std::vector aux_states(inputs.begin() + (int) batchnorm::kInMovingMean, inputs.end()); MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { - GetBatchNormOp(param).Forward(ctx, in_data, - req, outputs, aux_states); + BNForward(ctx, param, in_data, req, outputs, aux_states); }); } @@ -257,8 +242,8 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, std::vector in_grad(outputs.begin(), outputs.begin() + 3); MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { - GetBatchNormOp(param).Backward(ctx, out_grad, - in_data, out_data, req, in_grad, aux_states); + BNBackward(ctx, param, out_grad, in_data, out_data, req, + in_grad, aux_states); }); } diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index bbf4da9874c4..443f18fe42b1 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -90,12 +90,12 @@ static inline void ForEachFast(const BNTensor3 &in_data, /*!
\brief Forward CPU */ template -void BatchNormOp::DoForward(mshadow::Stream *, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void DoBNForward(mshadow::Stream *, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { // Input batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -165,7 +165,7 @@ void BatchNormOp::DoForward(mshadow::Stream *, // note that var is still invstd if (!param_.fix_gamma) { - if (IsWriting(req[batchnorm::kData])) { + if (IsBNWriting(req[batchnorm::kData])) { ForEachFast(inputData, outputData, channel, [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) { @@ -174,10 +174,10 @@ void BatchNormOp::DoForward(mshadow::Stream *, }); } } else { - if (IsWriting(req[batchnorm::kGamma])) { + if (IsBNWriting(req[batchnorm::kGamma])) { w[channel] = AccReal(1); } - if (IsWriting(req[batchnorm::kData])) { + if (IsBNWriting(req[batchnorm::kData])) { ForEachFast(inputData, outputData, channel, [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) { @@ -190,14 +190,14 @@ void BatchNormOp::DoForward(mshadow::Stream *, } template -void BatchNormOp::DoBackward(mshadow::Stream *, - const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void DoBNBackward(mshadow::Stream *, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { // Input Data batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -265,7 +265,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, dotp += (*thisInputData - mean) * (*gradOut_data); }); - if (!gradIn.IsEmpty() && IsWriting(req[batchnorm::kData])) { // if there's a grad input + if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) { // if there's a grad input if (is_train_and_not_global_stats) { // when in training mode // Q(X) = X - E[x] ; i.e. input centered to zero mean @@ -301,7 +301,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, // May want to make this a param eventually const AccReal scale = 1.0f; - if (IsWriting(req[batchnorm::kGamma])) { + if (IsBNWriting(req[batchnorm::kGamma])) { if (!param_.fix_gamma) { gradWeightData[channel] = scale * dotp * invstd; } else { @@ -309,7 +309,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, } } - if (IsWriting(req[batchnorm::kBeta])) { + if (IsBNWriting(req[batchnorm::kBeta])) { gradBiasData[channel] = scale * sumGradOut; } } diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 076ea3034b23..25c8044cab11 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -579,13 +579,13 @@ static inline uint32_t SetupFlags(const OpContext &ctx, flags |= ctx.is_train ? IS_TRAINING_FLAG : 0; flags |= params.fix_gamma ? FIX_GAMMA_FLAG : 0; flags |= params.use_global_stats ? 
USE_GLOBAL_STATS_FLAG : 0; - if (BatchNormOp::IsWriting(req[batchnorm::kData])) { + if (IsBNWriting(req[batchnorm::kData])) { flags |= WRITE_DATA_FLAG; } - if (BatchNormOp::IsWriting(req[batchnorm::kGamma])) { + if (IsBNWriting(req[batchnorm::kGamma])) { flags |= WRITE_GAMMA_FLAG; } - if (BatchNormOp::IsWriting(req[batchnorm::kBeta])) { + if (IsBNWriting(req[batchnorm::kBeta])) { flags |= WRITE_BETA_FLAG; } return flags; @@ -593,16 +593,16 @@ static inline uint32_t SetupFlags(const OpContext &ctx, /*! \brief Forward batch-norm pass on GPU */ template -void BatchNormOp::DoForward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void DoBNForward(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationUpdateOutput( stream, ctx, - param_, + param, in_data, out_data, aux_states, @@ -614,18 +614,18 @@ void BatchNormOp::DoForward(mshadow::Stream *stream, /*! \brief Backward batch-norm pass on GPU */ template -void BatchNormOp::DoBackward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void DoBNBackward(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationBackward( stream, ctx, - param_, + param, out_grad, in_data, out_data, @@ -646,9 +646,9 @@ static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { template<> void BatchNormCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { BatchNormParam param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 5U); std::vector in_data(inputs.begin(), inputs.begin() + 3); @@ -677,9 +677,9 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, template<> void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 11U); BatchNormParam param = nnvm::get(attrs.parsed); std::vector out_grad(1, inputs[0]); From 30b5fd997be40e42d7e49eca0af5c73263ab2fc3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 27 Nov 2017 23:54:18 +0000 Subject: [PATCH 028/264] Make SoftmaxActivation stateless. 
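
With the operator stateless, the CPU implementation moves wholesale into the generic SoftmaxActivationCompute/SoftmaxActivationGradCompute templates, so softmax_activation.cu only needs to specialize them when CuDNN is available; without CuDNN, the generic xpu template instantiates for gpu as well. A rough, self-contained sketch of that dispatch shape (Compute, cpu, and gpu are placeholder names standing in for the real NNVM compute functions and device tags):

    struct cpu {}; struct gpu {};

    template <typename Device>
    void Compute() { /* generic mshadow implementation, works on both devices */ }

    #if MXNET_USE_CUDNN == 1
    template <>
    void Compute<gpu>() { /* forward to the cached CuDNN operator instead */ }
    #endif  // otherwise Compute<gpu> falls back to the generic template
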
--- src/operator/nn/softmax_activation-inl.h | 124 +++++++++-------------- src/operator/nn/softmax_activation.cu | 14 +-- 2 files changed, 51 insertions(+), 87 deletions(-) diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h index 5b91b6f79e98..b1d542e4068c 100644 --- a/src/operator/nn/softmax_activation-inl.h +++ b/src/operator/nn/softmax_activation-inl.h @@ -61,98 +61,72 @@ struct SoftmaxActivationParam : public dmlc::Parameter { } }; -/** - * \brief This is the implementation of softmax_activation operator. - * \tparam xpu The device that the op will be executed on. - */ -template -class SoftmaxActivationOp { - public: - void Init(SoftmaxActivationParam p) { - this->param_ = p; - } - - void Forward(const OpContext &ctx, const TBlob &in_data, - const OpReqType &req, const TBlob &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = ctx.get_stream(); - if (param_.mode == softmax_activation::kInstance) { - Tensor data = in_data.FlatTo2D(s); - Tensor out = out_data.FlatTo2D(s); - Softmax(out, data); - } else { - CHECK_GE(in_data.ndim(), 3) - << "Input need to have a least 3 dimensions when mode=channel"; - int n = in_data.size(0); - int k = in_data.size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data.Size()/n/k)); - Tensor data = in_data.get_with_shape(s3, s); - Tensor out = out_data.get_with_shape(s3, s); - Softmax(out, data); - } - } - - void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - // Use 3d tensor for both mode -> {instance, channel}. Get shapes - int total_size = in_grad.Size(); - int batch_size = in_grad.shape_[0]; - int channel_num = in_grad.shape_[1]; - int rest_size = total_size / (batch_size * channel_num); - const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); - // Get tensors - Stream *s = ctx.get_stream(); - Tensor m_out_grad = - out_grad.get_with_shape(data_shape, s); - Tensor m_out_data = - out_data.get_with_shape(data_shape, s); - Tensor m_in_grad = - in_grad.get_with_shape(data_shape, s); - // get requested temp space - Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( - Shape2(batch_size, rest_size), s); - workspace = reduce_with_axis(m_out_grad * m_out_data, 1); - Assign(m_in_grad, req, - m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); - } - - private: - SoftmaxActivationParam param_; -}; // class SoftmaxActivationOp - - template void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, - const std::vector& req, + const std::vector& reqs, const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - - static thread_local SoftmaxActivationOp op; - op.Init(param); - op.Forward(ctx, inputs[0], req[0], outputs[0]); + const TBlob &in_data = inputs[softmax_activation::kData]; + const OpReqType &req = reqs[softmax_activation::kOut]; + const TBlob &out_data = outputs[softmax_activation::kOut]; + Stream *s = ctx.get_stream(); + if (param.mode == softmax_activation::kInstance) { + Tensor data = in_data.FlatTo2D(s); + Tensor out = out_data.FlatTo2D(s); + Softmax(out, data); + } else { + CHECK_GE(in_data.ndim(), 3) + << "Input needs to have at least 3 dimensions when mode=channel"; + int n
= in_data.size(0); + int k = in_data.size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data.Size()/n/k)); + Tensor data = in_data.get_with_shape(s3, s); + Tensor out = out_data.get_with_shape(s3, s); + Softmax(out, data); + } } template void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, - const std::vector& req, + const std::vector& reqs, const std::vector& outputs) { - const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + using namespace mshadow; + using namespace mshadow::expr; CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1); - CHECK_EQ(req.size(), 1); - - static thread_local SoftmaxActivationOp op; - op.Init(param); - op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); + CHECK_EQ(reqs.size(), 1); + const TBlob &out_grad = inputs[0]; + const TBlob &out_data = inputs[1]; + const OpReqType &req = reqs[0]; + const TBlob &in_grad = outputs[0]; + // Use 3d tensor for both mode -> {instance, channel}. Get shapes + int total_size = in_grad.Size(); + int batch_size = in_grad.shape_[0]; + int channel_num = in_grad.shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); + // Get tensors + Stream *s = ctx.get_stream(); + Tensor m_out_grad = + out_grad.get_with_shape(data_shape, s); + Tensor m_out_data = + out_data.get_with_shape(data_shape, s); + Tensor m_in_grad = + in_grad.get_with_shape(data_shape, s); + // get requested temp space + Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( + Shape2(batch_size, rest_size), s); + workspace = reduce_with_axis(m_out_grad * m_out_data, 1); + Assign(m_in_grad, req, + m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); } } // namespace op diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu index 8e6e787f8072..1cfe64f7e916 100644 --- a/src/operator/nn/softmax_activation.cu +++ b/src/operator/nn/softmax_activation.cu @@ -32,6 +32,7 @@ namespace mxnet { namespace op { +#if MXNET_USE_CUDNN == 1 template<> void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -42,15 +43,9 @@ void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); -#if MXNET_USE_CUDNN == 1 static thread_local CuDNNSoftmaxActivationOp op; op.Init(param); op.Forward(ctx, inputs[0], req[0], outputs[0]); -#else - static thread_local SoftmaxActivationOp op; - op.Init(param); - op.Forward(ctx, inputs[0], req[0], outputs[0]); -#endif } template<> @@ -64,16 +59,11 @@ void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1); CHECK_EQ(req.size(), 1); -#if MXNET_USE_CUDNN == 1 static thread_local CuDNNSoftmaxActivationOp op; op.Init(param); op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); -#else - static thread_local SoftmaxActivationOp op; - op.Init(param); - op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); -#endif } +#endif NNVM_REGISTER_OP(SoftmaxActivation) .set_attr("FCompute", SoftmaxActivationCompute); From 95ef90e7eca95020b29dde3ba35a4faec7b5988e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 01:15:47 +0000 Subject: [PATCH 029/264] Fix a code style problem. 
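
The style problem is the pair of C-style (int) casts on the batchnorm input enum: unscoped enumerators convert to int implicitly in iterator arithmetic, so the casts were pure noise. A minimal, self-contained illustration (the enum ordering here is illustrative, and std::vector<int> stands in for the TBlob vectors):

    #include <vector>
    namespace batchnorm { enum Inputs { kData, kGamma, kBeta, kInMovingMean, kInMovingVar }; }

    std::vector<int> SplitInData(const std::vector<int> &inputs) {
      // kInMovingMean promotes to int implicitly; no (int) cast required
      return std::vector<int>(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
    }
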
--- src/operator/nn/batch_norm-inl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 51361064e4c9..a6b11fc647f6 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -214,8 +214,8 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, const BatchNormParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 5U); std::vector in_data(inputs.begin(), - inputs.begin() + (int) batchnorm::kInMovingMean); - std::vector aux_states(inputs.begin() + (int) batchnorm::kInMovingMean, + inputs.begin() + batchnorm::kInMovingMean); + std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { BNForward(ctx, param, in_data, req, outputs, aux_states); @@ -231,8 +231,8 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, const BatchNormParam& param = nnvm::get(attrs.parsed); int num_out_grads = param.output_mean_var ? 3U : 1U; int in_data_start = 3; - int aux_states_start = in_data_start + (int) batchnorm::kInMovingMean; - int out_data_start = in_data_start + (int) batchnorm::kInMovingVar + 1; + int aux_states_start = in_data_start + batchnorm::kInMovingMean; + int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); std::vector in_data(inputs.begin() + in_data_start, inputs.begin() + aux_states_start); From 921859a803b07ff526f4195a22da8b990d646a94 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 02:29:06 +0000 Subject: [PATCH 030/264] pass amalgamation test for batch norm. --- src/operator/nn/batch_norm.cc | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 443f18fe42b1..5db8d1a1a0e7 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -349,14 +349,6 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs, return true; } -static inline std::vector ListArguments() { - return {"data", "gamma", "beta"}; -} - -static inline std::vector ListOutputs() { - return {"output", "mean", "var"}; -} - static bool BatchNormType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { using namespace mshadow; @@ -369,14 +361,16 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, int dtype_param; MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType::kFlag; }); + std::vector args{"data", "gamma", "beta"}; + CHECK_LE(in_type->size(), args.size()); for (index_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype_param; } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); + UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, args[i]); } } - const size_t n_out = ListOutputs().size(); + const size_t n_out = 3; out_type->clear(); out_type->push_back(dtype); for (size_t i = 1; i < n_out; ++i) { From 485f58f9bffdfaf42df1883d610427c8410fe21b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 02:29:33 +0000 Subject: [PATCH 031/264] pass amalgamation test for dropout. 
--- src/operator/nn/dropout.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index b28113ce9322..9facb6d26680 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -40,10 +40,6 @@ struct DropoutGrad { } }; -std::vector ListOutputs() { - return std::vector{"output", "mask"}; -} - DMLC_REGISTER_PARAMETER(DropoutParam); NNVM_REGISTER_OP(Dropout) @@ -86,7 +82,7 @@ Example:: }) .set_attr("FListOutputNames", [](const NodeAttrs& attrs) { - return ListOutputs(); + return std::vector{"output", "mask"}; }) .set_attr("FNumVisibleOutputs", [](const NodeAttrs& attrs) { @@ -113,7 +109,7 @@ Example:: return false; } - size_t nout = ListOutputs().size(); + size_t nout = 2; out_type->clear(); for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); return true; From 660968f8aef025c36bacfc24b97bdc4fc7ae3c41 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 02:55:29 +0000 Subject: [PATCH 032/264] Get convolution ops from a function. --- src/operator/nn/convolution-inl.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 84babed946b3..bb9d4eb0973a 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -385,6 +385,13 @@ class ConvolutionOp { bool is_1x1_; }; // class ConvolutionOp +template +ConvolutionOp &GetConvolutionOp(const ConvolutionParam& param) { + static thread_local ConvolutionOp op; + op.Init(param); + return op; +} + template void ConvolutionCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -392,9 +399,7 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { - static thread_local ConvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); + GetConvolutionOp(param).Forward(ctx, inputs, req, outputs); }); } @@ -409,9 +414,8 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, const std::vector &in_grad = outputs; MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - static thread_local ConvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + GetConvolutionOp(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); }); } From 26e9430daee5a2d1cbd080a4486bc102d4879363 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 02:58:58 +0000 Subject: [PATCH 033/264] Fix compilation errors for GPU. 
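
Two kinds of fixes here: the CuDNN-only helpers gain #if MXNET_USE_CUDNN guards so that GPU builds without CuDNN still compile, and in dropout the pkeep_ local moves inside the MKL branch, where it would otherwise sit unused (and warn) in non-MKL and CUDA builds. A minimal sketch of the latter, with placeholder names:

    void BackwardSketch(float p) {
    #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
      // only the MKL/OpenMP path reads pkeep, so declare it here ...
      const float pkeep = 1.0f - p;
      (void)pkeep;  // ... masked scaling by 1/pkeep goes here
    #else
      (void)p;      // otherwise pkeep would be unused and trip -Werror builds
    #endif
    }
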
--- src/operator/nn/activation.cu | 4 ++-- src/operator/nn/batch_norm.cu | 18 ++++++++++-------- src/operator/nn/convolution.cu | 2 ++ src/operator/nn/deconvolution.cu | 2 ++ src/operator/nn/dropout-inl.h | 2 +- src/operator/nn/fully_connected.cu | 6 ++---- src/operator/nn/pooling.cu | 6 +++--- 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu index 0dea6c3bb5a4..238e23370180 100644 --- a/src/operator/nn/activation.cu +++ b/src/operator/nn/activation.cu @@ -54,7 +54,7 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs, // SoftReLU not supported by CUDNN yet if (param.act_type == activation::kSoftReLU) { MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - get_activation_op().Forward(ctx, + ActivationForward(ctx, inputs[0], req[0], outputs[0]); }); } else { @@ -78,7 +78,7 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs, // SoftReLU not supported by CUDNN yet if (param.act_type == activation::kSoftReLU) { MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - get_activation_op().Backward( + ActivationBackward( ctx, inputs[0], inputs[1], req[0], outputs[0]); }); } else { diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 25c8044cab11..9b1f5d6c9183 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -594,7 +594,7 @@ static inline uint32_t SetupFlags(const OpContext &ctx, /*! \brief Forward batch-norm pass on GPU */ template void DoBNForward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, + const OpContext &ctx, const BatchNormParam& param_, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -602,7 +602,7 @@ void DoBNForward(mshadow::Stream *stream, batchnorm::cuda::BatchNormalizationUpdateOutput( stream, ctx, - param, + param_, in_data, out_data, aux_states, @@ -615,7 +615,7 @@ void DoBNForward(mshadow::Stream *stream, /*! 
\brief Backward batch-norm pass on GPU */ template -void DoBNBackward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, + const OpContext &ctx, const BatchNormParam& param_, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -625,7 +625,7 @@ void DoBNBackward(mshadow::Stream *stream, batchnorm::cuda::BatchNormalizationBackward( stream, ctx, - param, + param_, out_grad, in_data, out_data, @@ -637,12 +637,14 @@ void DoBNBackward(mshadow::Stream *stream, MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu); } +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 template static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { static thread_local CuDNNBatchNormOp op; op.Init(param); return op; } +#endif template<> void BatchNormCompute(const nnvm::NodeAttrs& attrs, @@ -665,12 +667,12 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); + BNForward(ctx, param, in_data, req, outputs, aux_states); }) } #else MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { - GetBatchNormOp(param).Forward(ctx, in_data, req, outputs, aux_states); + BNForward(ctx, param, in_data, req, outputs, aux_states); }); #endif } @@ -700,13 +702,13 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - GetBatchNormOp(param).Backward(ctx, out_grad, + BNBackward(ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states); }) } #else MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { - GetBatchNormOp(param).Backward(ctx, out_grad, + BNBackward(ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states); }); #endif diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index 50b4b04ff354..932155a3e636 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -42,6 +42,7 @@ static ConvolutionOp &get_op(const ConvolutionParam& param) { return op; } +#if MXNET_USE_CUDNN == 1 template static CuDNNConvolutionOp &get_cudnn_op(const ConvolutionParam& param, int forward_compute_type, int backward_compute_type, @@ -52,6 +53,7 @@ static CuDNNConvolutionOp &get_cudnn_op(const ConvolutionParam& param, in_shape, out_shape, ctx); return op; } +#endif template<> void ConvolutionCompute(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 2812e4f46e12..9219e95d9f06 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -39,6 +39,7 @@ static DeconvolutionOp &get_op(const DeconvolutionParam& param) { return op; } +#if MXNET_USE_CUDNN == 1 template static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param, int forward_compute_type, @@ -54,6 +55,7 @@ static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); return op; } +#endif template<> void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index b206259277a4..a6b142b93973 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -137,7 +137,6 @@ void DropoutBackward(const OpContext &ctx, const DropoutParam &param, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr;
- real_t pkeep_ = 1.0f - param.p; int mode_ = param.mode; Stream *s = ctx.get_stream(); Tensor grad = out_grad.FlatTo2D(s); @@ -145,6 +144,7 @@ void DropoutBackward(const OpContext &ctx, const DropoutParam &param, Tensor gdata = in_grad.FlatTo2D(s); if (ctx.is_train || mode_ == dropout::kAlways) { #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) + real_t pkeep_ = 1.0f - param.p; DType* ingradptr = gdata.dptr_; DType* outgradptr = grad.dptr_; auto maskptr = reinterpret_cast(mask.dptr_); diff --git a/src/operator/nn/fully_connected.cu index 7637865f2472..c89d37767c4a 100644 --- a/src/operator/nn/fully_connected.cu +++ b/src/operator/nn/fully_connected.cu @@ -39,8 +39,7 @@ void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, int dtype = inputs[0].type_flag_; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - FullyConnectedOp::get_op(param).Forward(ctx, inputs, - req, outputs); + FCForward(ctx, param, inputs, req, outputs); }); } @@ -61,8 +60,7 @@ void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, int dtype = inputs[0].type_flag_; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - FullyConnectedOp::get_op(param).Backward(ctx, out_grad, in_data, - req, outputs); + FCBackward(ctx, param, out_grad, in_data, req, outputs); }); } diff --git a/src/operator/nn/pooling.cu index 24aa4178b3c7..afe4f3c413f0 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -71,7 +71,7 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - GetPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + PoolingForward(ctx, param, inputs[0], req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; } @@ -110,8 +110,8 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - GetPoolingOp(param).Backward(ctx, - inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + PoolingBackward(ctx, param, inputs[0], + inputs[1], inputs[2], req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; } From 5504e2c393495253119ce2bbd09a552867e023bf Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 18:49:36 +0000 Subject: [PATCH 034/264] Fix thread local in diff platforms.
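
Not every supported toolchain accepts the C++11 thread_local keyword (older OS X compilers in particular), so dmlc-core exposes DMLC_CXX11_THREAD_LOCAL to detect it, and MXNet falls back to MX_THREAD_LOCAL, which maps to the compiler-specific TLS equivalent (e.g. __thread-style storage). The pattern repeated throughout this commit, sketched with a placeholder Counter type standing in for the cached CuDNN op:

    struct Counter { int n = 0; };

    Counter &GetThreadLocalCounter() {
    #if DMLC_CXX11_THREAD_LOCAL
      static thread_local Counter c;    // C++11 keyword
    #else
      static MX_THREAD_LOCAL Counter c; // compiler-specific TLS (dmlc/mxnet macro)
    #endif
      return c;
    }
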
--- src/operator/nn/activation.cu | 4 +++ src/operator/nn/batch_norm.cu | 4 +++ src/operator/nn/convolution-inl.h | 4 +++ src/operator/nn/convolution.cu | 42 +++++++++++++---------- src/operator/nn/cudnn/cudnn_batch_norm.cu | 4 +++ src/operator/nn/deconvolution-inl.h | 20 +++++++---- src/operator/nn/deconvolution.cu | 29 ++++++---------- src/operator/nn/pooling.cu | 4 +++ src/operator/nn/softmax_activation.cu | 21 +++++++----- 9 files changed, 81 insertions(+), 51 deletions(-) diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu index 238e23370180..dc435b2acc17 100644 --- a/src/operator/nn/activation.cu +++ b/src/operator/nn/activation.cu @@ -36,7 +36,11 @@ namespace op { template static CuDNNActivationOp &get_cudnn_op(const ActivationParam& param) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local CuDNNActivationOp cudnn_op; +#else + static MX_THREAD_LOCAL CuDNNActivationOp cudnn_op; +#endif cudnn_op.Init(param); return cudnn_op; } diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 9b1f5d6c9183..682c286f4a3a 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -640,7 +640,11 @@ void DoBNBackward(mshadow::Stream *stream, #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 template static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local CuDNNBatchNormOp op; +#else + static MX_THREAD_LOCAL CuDNNBatchNormOp op; +#endif op.Init(param); return op; } diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index bb9d4eb0973a..93e4d30fa39e 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -387,7 +387,11 @@ class ConvolutionOp { template ConvolutionOp &GetConvolutionOp(const ConvolutionParam& param) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local ConvolutionOp op; +#else + static MX_THREAD_LOCAL ConvolutionOp op; +#endif op.Init(param); return op; } diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index 932155a3e636..35a3f80b96a3 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -34,21 +34,17 @@ namespace mxnet { namespace op { -// This is to maintain one copy for each type. 
-template -static ConvolutionOp &get_op(const ConvolutionParam& param) { - static thread_local ConvolutionOp op; - op.Init(param); - return op; -} - #if MXNET_USE_CUDNN == 1 template -static CuDNNConvolutionOp &get_cudnn_op(const ConvolutionParam& param, +static CuDNNConvolutionOp &GetCuDNNConvOp(const ConvolutionParam& param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, const Context& ctx) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local CuDNNConvolutionOp op; +#else + static MX_THREAD_LOCAL CuDNNConvolutionOp op; +#endif op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); return op; @@ -66,7 +62,7 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, // If 1D convolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Forward(ctx, inputs, req, outputs); }) return; @@ -76,7 +72,11 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, param.kernel.ndim() == 2 && param.dilate == mshadow::Shape2(1, 1) && dtype == mshadow::kFloat32) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local DepthwiseConvolutionOp op; +#else + static MX_THREAD_LOCAL DepthwiseConvolutionOp op; +#endif std::vector in_shape(inputs.size()); std::vector out_shape(1, outputs[0].shape_); for (size_t i = 0; i < in_shape.size(); i++) @@ -92,26 +92,26 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Forward(ctx, inputs, req, outputs); } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Forward(ctx, inputs, req, outputs); } else { std::vector in_shape(inputs.size()); std::vector out_shape(1, outputs[0].shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = inputs[i].shape_; - CuDNNConvolutionOp &op = get_cudnn_op(param, + CuDNNConvolutionOp &op = GetCuDNNConvOp(param, compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); op.Forward(ctx, inputs, req, outputs); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Forward(ctx, inputs, req, outputs); }) #endif // MXNET_USE_CUDNN @@ -131,7 +131,7 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, // If 1D convolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) return; @@ -141,7 +141,11 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, param.kernel.ndim() == 2 && param.dilate == mshadow::Shape2(1, 1) && dtype == mshadow::kFloat32) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local DepthwiseConvolutionOp op; +#else + static MX_THREAD_LOCAL DepthwiseConvolutionOp op; +#endif // The first element stores out grad. 
std::vector in_shape(in_data.size()); std::vector out_shape(1, out_grad.shape_); @@ -158,12 +162,12 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else { // The first element stores out grad. @@ -171,14 +175,14 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, std::vector out_shape(1, out_grad.shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = in_data[i].shape_; - CuDNNConvolutionOp &op = get_cudnn_op(param, + CuDNNConvolutionOp &op = GetCuDNNConvOp(param, compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = get_op(param); + ConvolutionOp &op = GetConvolutionOp(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu index c929ab2e2878..e07cd1e6c8f6 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cu +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu @@ -33,7 +33,11 @@ namespace op { template static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local CuDNNBatchNormOp op; +#else + static MX_THREAD_LOCAL CuDNNBatchNormOp op; +#endif op.Init(param); return op; } diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index d97eb0ab4304..5f2babb27e36 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -453,15 +453,24 @@ class DeconvolutionOp { index_t nstep_; }; // class DeconvolutionOp +template +DeconvolutionOp &GetDeconvolutionOp(const DeconvolutionParam& param) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local DeconvolutionOp op; +#else + static MX_THREAD_LOCAL DeconvolutionOp op; +#endif + op.Init(param); + return op; +} + template void _DeconvolutionCompute(const DeconvolutionParam& param, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { - static thread_local DeconvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); + GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); }); } @@ -484,9 +493,8 @@ void _DeconvolutionGradCompute(const DeconvolutionParam& param, const std::vector &in_grad = outputs; MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - static thread_local DeconvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, + in_data, req, in_grad); }); } diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 9219e95d9f06..16412a38b1d4 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -32,16 +32,9 @@ namespace mxnet { namespace op { -template -static 
DeconvolutionOp &get_op(const DeconvolutionParam& param) { - static thread_local DeconvolutionOp op; - op.Init(param); - return op; -} - #if MXNET_USE_CUDNN == 1 template -static CuDNNDeconvolutionOp &get_cudnn_op(const DeconvolutionParam& param, +static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -68,7 +61,7 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, // If 1D deconvolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - get_op(param).Forward(ctx, inputs, req, outputs); + GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); }) return; } @@ -78,25 +71,25 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - get_op(param).Forward(ctx, inputs, req, outputs); + GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - get_op(param).Forward(ctx, inputs, req, outputs); + GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); } else { std::vector in_shape(inputs.size()); std::vector out_shape(1, outputs[0].shape_); for (size_t i = 0; i < in_shape.size(); i++) { in_shape[i] = inputs[i].shape_; } - get_cudnn_op(param, compute_type, compute_type, + GetCuDNNDeconvOp(param, compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx, false).Forward(ctx, inputs, req, outputs); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - get_op(param).Forward(ctx, inputs, req, outputs); + GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); }) #endif // MXNET_USE_CUDNN } @@ -116,7 +109,7 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, // If 1D deconvolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - get_op(param).Backward(ctx, std::vector{out_grad}, + GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) return; @@ -127,13 +120,13 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - get_op(param).Backward(ctx, std::vector{out_grad}, + GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - get_op(param).Backward(ctx, std::vector{out_grad}, + GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else { std::vector in_shape(in_data.size()); @@ -141,14 +134,14 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, for (size_t i = 0; i < in_shape.size(); i++) { in_shape[i] = in_data[i].shape_; } - get_cudnn_op(param, compute_type, compute_type, + GetCuDNNDeconvOp(param, compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx, true).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - get_op(param).Backward(ctx, std::vector{out_grad}, + GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index 
afe4f3c413f0..de7dbf12606d 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -35,7 +35,11 @@ namespace op { #if MXNET_USE_CUDNN == 1 template static CuDNNPoolingOp &GetCuDNNPoolingOp(const PoolingParam &param) { +#if DMLC_CXX11_THREAD_LOCAL static thread_local CuDNNPoolingOp op; +#else + static MX_THREAD_LOCAL CuDNNPoolingOp op; +#endif op.Init(param); return op; } diff --git a/src/operator/nn/softmax_activation.cu index 1cfe64f7e916..f3997e00052e 100644 --- a/src/operator/nn/softmax_activation.cu +++ b/src/operator/nn/softmax_activation.cu @@ -33,6 +33,17 @@ namespace mxnet { namespace op { #if MXNET_USE_CUDNN == 1 + +static inline CuDNNSoftmaxActivationOp &GetCuDNNSoftmaxActOp(const SoftmaxActivationParam& param) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNSoftmaxActivationOp op; +#else + static MX_THREAD_LOCAL CuDNNSoftmaxActivationOp op; +#endif + op.Init(param); + return op; +} + template<> void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -42,10 +53,7 @@ void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - - static thread_local CuDNNSoftmaxActivationOp op; - op.Init(param); - op.Forward(ctx, inputs[0], req[0], outputs[0]); + GetCuDNNSoftmaxActOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); } template<> @@ -58,10 +66,7 @@ void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1); CHECK_EQ(req.size(), 1); - - static thread_local CuDNNSoftmaxActivationOp op; - op.Init(param); - op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); + GetCuDNNSoftmaxActOp(param).Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); } #endif NNVM_REGISTER_OP(SoftmaxActivation) .set_attr("FCompute", SoftmaxActivationCompute); From 6324176a26c2c250bae3608c2ef13698584d2373 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 20:58:21 +0000 Subject: [PATCH 035/264] Avoid using thread_local for non-CuDNN conv/deconv.
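
The CuDNN paths keep a cached per-thread op (its descriptors are expensive to rebuild), but for the plain ConvolutionOp/DeconvolutionOp the cached thread_local instance is dropped entirely: as the TODO in the diff notes, forward and backward may run on different engine threads, so state cached between the two calls is not safe. Since Init() only copies the parameter struct, a fresh stack instance per call costs essentially nothing. The resulting call shape on the non-CuDNN path, as it appears in the diff below:

    MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, {
      ConvolutionOp<xpu, DType> op;   // stack-local, no shared state
      op.Init(param);                 // copies the parameter struct only
      op.Forward(ctx, inputs, req, outputs);
    });
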
--- src/operator/nn/convolution-inl.h | 20 ++++-------- src/operator/nn/convolution.cu | 36 +++++++++++----------- src/operator/nn/deconvolution-inl.h | 20 ++++-------- src/operator/nn/deconvolution.cu | 48 ++++++++++++++++++----------- 4 files changed, 60 insertions(+), 64 deletions(-) diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 93e4d30fa39e..bc54996c3ea4 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -385,17 +385,6 @@ class ConvolutionOp { bool is_1x1_; }; // class ConvolutionOp -template -ConvolutionOp &GetConvolutionOp(const ConvolutionParam& param) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local ConvolutionOp op; -#else - static MX_THREAD_LOCAL ConvolutionOp op; -#endif - op.Init(param); - return op; -} - template void ConvolutionCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -403,7 +392,9 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { - GetConvolutionOp(param).Forward(ctx, inputs, req, outputs); + ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); }); } @@ -418,8 +409,9 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, const std::vector &in_grad = outputs; MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - GetConvolutionOp(param).Backward(ctx, std::vector{out_grad}, - in_data, req, in_grad); + ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }); } diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index 35a3f80b96a3..d7f9e564a603 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -62,7 +62,8 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, // If 1D convolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Forward(ctx, inputs, req, outputs); }) return; @@ -72,15 +73,11 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, param.kernel.ndim() == 2 && param.dilate == mshadow::Shape2(1, 1) && dtype == mshadow::kFloat32) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local DepthwiseConvolutionOp op; -#else - static MX_THREAD_LOCAL DepthwiseConvolutionOp op; -#endif std::vector in_shape(inputs.size()); std::vector out_shape(1, outputs[0].shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = inputs[i].shape_; + DepthwiseConvolutionOp op; op.Init(param, in_shape, out_shape); op.Forward(ctx, inputs, req, outputs); return; @@ -92,12 +89,14 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Forward(ctx, inputs, req, outputs); } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Forward(ctx, inputs, req, outputs); } else { std::vector in_shape(inputs.size()); @@ -111,7 +110,8 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = 
GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Forward(ctx, inputs, req, outputs); }) #endif // MXNET_USE_CUDNN @@ -131,7 +131,8 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, // If 1D convolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) return; @@ -141,16 +142,12 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, param.kernel.ndim() == 2 && param.dilate == mshadow::Shape2(1, 1) && dtype == mshadow::kFloat32) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local DepthwiseConvolutionOp op; -#else - static MX_THREAD_LOCAL DepthwiseConvolutionOp op; -#endif // The first element stores out grad. std::vector in_shape(in_data.size()); std::vector out_shape(1, out_grad.shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = in_data[i].shape_; + DepthwiseConvolutionOp op; op.Init(param, in_shape, out_shape); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); return; @@ -162,12 +159,14 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else { // The first element stores out grad. 
@@ -182,7 +181,8 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp &op = GetConvolutionOp(param); + ConvolutionOp op; + op.Init(param); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5f2babb27e36..3dddee7daa46 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -453,24 +453,15 @@ class DeconvolutionOp { index_t nstep_; }; // class DeconvolutionOp -template -DeconvolutionOp &GetDeconvolutionOp(const DeconvolutionParam& param) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local DeconvolutionOp op; -#else - static MX_THREAD_LOCAL DeconvolutionOp op; -#endif - op.Init(param); - return op; -} - template void _DeconvolutionCompute(const DeconvolutionParam& param, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { - GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); }); } @@ -493,8 +484,9 @@ void _DeconvolutionGradCompute(const DeconvolutionParam& param, const std::vector &in_grad = outputs; MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, - in_data, req, in_grad); + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }); } diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 16412a38b1d4..8e6f4b677a2c 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -35,12 +35,12 @@ namespace op { #if MXNET_USE_CUDNN == 1 template static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& param, - int forward_compute_type, - int backward_compute_type, - const std::vector& in_shape, - const std::vector& out_shape, - const Context& ctx, bool backward) { - // Convolution forward has to be called before backward for this operator. + int forward_compute_type, + int backward_compute_type, + const std::vector& in_shape, + const std::vector& out_shape, + const Context& ctx, bool backward) { + // TODO (zhengda) Convolution forward has to be called before backward for this operator. // So we can't make this operator thread local. backward might be called // in another thread. 
static CuDNNDeconvolutionOp op; @@ -61,7 +61,9 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, // If 1D deconvolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); }) return; } @@ -71,12 +73,16 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); } else { std::vector in_shape(inputs.size()); std::vector out_shape(1, outputs[0].shape_); @@ -89,7 +95,9 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - GetDeconvolutionOp(param).Forward(ctx, inputs, req, outputs); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); }) #endif // MXNET_USE_CUDNN } @@ -109,8 +117,9 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, // If 1D deconvolution, use MXNet implementation if (param.kernel.ndim() == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, - in_data, req, in_grad); + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) return; } @@ -120,14 +129,16 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, - in_data, req, in_grad); + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, - in_data, req, in_grad); + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } else { std::vector in_shape(in_data.size()); std::vector out_shape(1, out_grad.shape_); @@ -141,8 +152,9 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - GetDeconvolutionOp(param).Backward(ctx, std::vector{out_grad}, - in_data, req, in_grad); + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN } From 36c466f83824b88dd447b945875dfa65d630d352 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 21:05:24 +0000 Subject: [PATCH 036/264] Remove TODO in deconv. 
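
The TODO prefix comes off because the comment records a design constraint,
not pending work: cuDNN deconvolution backward consumes state that forward
sets up, and the two passes may run on different threads, so the cached op
must be one shared static rather than thread_local. A toy sketch (assumed
names, not the real MXNet class) of why per-thread caching would break the
forward/backward pairing:

    #include <cassert>
    #include <thread>

    // Toy op whose Backward() needs state created by Forward().
    struct Op {
      bool fwd_done = false;
      void Forward()  { fwd_done = true; }
      void Backward() { assert(fwd_done && "backward needs forward's state"); }
    };

    Op &SharedOp()    { static Op op; return op; }               // what the code keeps
    Op &PerThreadOp() { static thread_local Op op; return op; }  // what it must avoid

    int main() {
      // Shared static: forward and backward on different threads still
      // reach the same instance (the joins order the accesses).
      std::thread([] { SharedOp().Forward(); }).join();
      std::thread([] { SharedOp().Backward(); }).join();   // OK

      // thread_local: a second thread would get a fresh Op with
      // fwd_done == false, and Backward() would assert.
      std::thread([] { PerThreadOp().Forward(); }).join();
      std::thread([] { /* PerThreadOp().Backward() here would assert */ }).join();
    }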
--- src/operator/nn/deconvolution.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 8e6f4b677a2c..e688e49ab20d 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -40,7 +40,7 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p const std::vector& in_shape, const std::vector& out_shape, const Context& ctx, bool backward) { - // TODO (zhengda) Convolution forward has to be called before backward for this operator. + // Convolution forward has to be called before backward for this operator. // So we can't make this operator thread local. backward might be called // in another thread. static CuDNNDeconvolutionOp op; From 64106840667598105d61f6bf497afdd4d444a2b6 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 28 Nov 2017 21:34:01 +0000 Subject: [PATCH 037/264] Fix a compilation error in dropout. --- src/operator/nn/dropout-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index a6b142b93973..343201062dbe 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -114,7 +114,7 @@ void DropoutForward(const OpContext &ctx, const DropoutParam ¶m, DType* dataptr = data.dptr_; auto maskptr = reinterpret_cast(mask.dptr_); int count = mask.shape_[0]*mask.shape_[1]; - bernoulli_generate(count, this->pkeep_, maskptr); + bernoulli_generate(count, pkeep_, maskptr); const float pk_1 = 1.0f / pkeep_; #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (int i = 0; i < count; ++i) { From 1fa389840847df977c12fb3c2736a5d7ae42159e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 29 Nov 2017 01:40:51 +0000 Subject: [PATCH 038/264] Fix a bug in batch norm. --- src/operator/nn/batch_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 5db8d1a1a0e7..bb5a70658d21 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -361,7 +361,7 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, int dtype_param; MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType::kFlag; }); - std::vector args{"data", "gamma", "beta"}; + std::vector args{"data", "gamma", "beta", "mean", "var"}; CHECK_LE(in_type->size(), args.size()); for (index_t i = 1; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { From 588383a35722e75b282c9dee7b19976418d23ffa Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 29 Nov 2017 01:41:10 +0000 Subject: [PATCH 039/264] Fix a bug in fully connected. --- src/operator/nn/fully_connected.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index cc475e04dd44..c4edf6dcab9b 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -137,7 +137,10 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored. .add_arguments(FullyConnectedParam::__FIELDS__()); NNVM_REGISTER_OP(_backward_FullyConnected) -.set_num_outputs(3) +.set_num_outputs([](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3; +}) .set_attr("TIsBackward", true) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{1, 0}}; From 66a281a39f96779a51bb00ecc2a076c7d7292dc4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 29 Nov 2017 01:42:00 +0000 Subject: [PATCH 040/264] Don't set #inputs for backward convolution. --- src/operator/nn/convolution.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 2c010674314d..60c56d69d340 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -422,7 +422,6 @@ There are other options to tune the performance. .add_arguments(ConvolutionParam::__FIELDS__()); NNVM_REGISTER_OP(_backward_Convolution) -.set_num_inputs(3) .set_num_outputs([](const NodeAttrs& attrs) { const ConvolutionParam& params = nnvm::get(attrs.parsed); return params.no_bias ? 2 : 3; From d3ce902e77814a6ccf097648a10d68c777811996 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 20 Oct 2017 16:48:49 -0700 Subject: [PATCH 041/264] Remove MKL code. --- src/operator/mkl/mkl_batch_norm-inl.h | 391 ------- src/operator/mkl/mkl_concat-inl.h | 314 ------ src/operator/mkl/mkl_convolution-inl.h | 490 --------- src/operator/mkl/mkl_cppwrapper.cc | 44 - src/operator/mkl/mkl_cppwrapper.h | 1020 ------------------- src/operator/mkl/mkl_elementwise_copy-inl.h | 69 -- src/operator/mkl/mkl_elementwise_sum-inl.h | 117 --- src/operator/mkl/mkl_fully_connected-inl.h | 192 ---- src/operator/mkl/mkl_lrn-inl.h | 265 ----- src/operator/mkl/mkl_memory-inl.h | 137 --- src/operator/mkl/mkl_memory.cc | 291 ------ src/operator/mkl/mkl_memory.h | 123 --- src/operator/mkl/mkl_pooling-inl.h | 357 ------- src/operator/mkl/mkl_relu-inl.h | 272 ----- src/operator/mkl/mkl_util-inl.h | 110 -- 15 files changed, 4192 deletions(-) delete mode 100644 src/operator/mkl/mkl_batch_norm-inl.h delete mode 100644 src/operator/mkl/mkl_concat-inl.h delete mode 100644 src/operator/mkl/mkl_convolution-inl.h delete mode 100644 src/operator/mkl/mkl_cppwrapper.cc delete mode 100644 src/operator/mkl/mkl_cppwrapper.h delete mode 100644 src/operator/mkl/mkl_elementwise_copy-inl.h delete mode 100644 src/operator/mkl/mkl_elementwise_sum-inl.h delete mode 100644 src/operator/mkl/mkl_fully_connected-inl.h delete mode 100644 src/operator/mkl/mkl_lrn-inl.h delete mode 100644 src/operator/mkl/mkl_memory-inl.h delete mode 100644 src/operator/mkl/mkl_memory.cc delete mode 100644 src/operator/mkl/mkl_memory.h delete mode 100644 src/operator/mkl/mkl_pooling-inl.h delete mode 100644 src/operator/mkl/mkl_relu-inl.h delete mode 100644 src/operator/mkl/mkl_util-inl.h diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h deleted file mode 100644 index b5967f4de294..000000000000 --- a/src/operator/mkl/mkl_batch_norm-inl.h +++ /dev/null @@ -1,391 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_batch_norm-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLBatchNormOp : public Operator { - public: - explicit MKLBatchNormOp(BatchNormParam param) { - this->param_ = param; - fwd_top_data = MKLData::create(); - fwd_bottom_data = MKLData::create(); - bwd_top_diff = MKLData::create(); - bwd_bottom_diff = MKLData::create(); - scaleShift_space.dptr = NULL; - scaleShiftDiff_space.dptr = NULL; - } - virtual ~MKLBatchNormOp() { - if (batchNormFwdInference != NULL) dnnDelete(batchNormFwdInference); - if (batchNormFwdTraining != NULL) dnnDelete(batchNormFwdTraining); - if (batchNormBwdScaleShift != NULL) dnnDelete(batchNormBwdScaleShift); - dnnLayoutDelete(layout_usr_); - if (scaleShift_space.dptr) - Storage::Get()->Free(scaleShift_space); - if (scaleShiftDiff_space.dptr) - Storage::Get()->Free(scaleShiftDiff_space); - } - static std::string getName() { - return "MKLBatchNormOp"; - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - eps_ = param_.eps; - size_t dim = 4, sizes[4], strides[4]; - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - - sizes[0] = width_; - sizes[1] = height_; - sizes[2] = channels_; - sizes[3] = num_; - - strides[0] = 1; - strides[1] = sizes[0]; - strides[2] = sizes[0] * sizes[1]; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - - // Names are for debugging only - fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); - fwd_top_data->name = "fwd_top_data @ " + getName(); - bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); - bwd_top_diff->name = "bwd_top_diff @ " + getName(); - - dnnError_t e; - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); - - fwd_bottom_data->create_user_layout(dim, sizes, strides); - fwd_top_data->create_user_layout(dim, sizes, strides); - bwd_bottom_diff->create_user_layout(dim, sizes, strides); - bwd_top_diff->create_user_layout(dim, sizes, strides); - - // Primitives will be allocated during the first fwd pass - batchNormFwdInference = NULL; - batchNormFwdTraining = NULL; - batchNormBwdScaleShift = NULL; - int scaleShift_size = channels_*2*sizeof(DType); - scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); - scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); - DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); - /*!use_weight_bias_*/ - for (int i = 0; i < channels_; i++) { - scaleShift_buf[i] = 1.0; - scaleShift_buf[channels_ + i] = 0; - } - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(aux_states.size(), 2); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 3); - CHECK_EQ(req.size(), 3); - } else { - CHECK_GE(out_data.size(), 1); - CHECK_GE(req.size(), 1); - CHECK_EQ(req[batchnorm::kOut], kWriteTo); - } - - Stream *s = ctx.get_stream(); - Tensor data; - 
Tensor out; - if (in_data[batchnorm::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0], - in_data[batchnorm::kData].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[batchnorm::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[batchnorm::kOut], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); - out = mkl_experimental_direct_get(out_data[batchnorm::kOut], s); - } - - // const real_t scale = static_cast(in_data[batchnorm::kData].shape_[1]) / - // static_cast(in_data[batchnorm::kData].shape_.Size()); - - Tensor slope = in_data[batchnorm::kGamma].get(s); - Tensor bias = in_data[batchnorm::kBeta].get(s); - Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); - Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - - if (param_.fix_gamma) - slope = 1.f; - - dnnError_t e; - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); -#endif - int bwd_flags = dnnUseScaleShift; - if (param_.use_global_stats) - bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance; -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - // Is it the first pass? Create a primitive. - if (batchNormFwdInference == NULL) { - std::shared_ptr bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_; - std::shared_ptr bottom_prv_desc = bottom_data_mem->get_prv_descriptor(); - CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_desc); - CHECK(mem_descr != NULL); - fwd_bottom_data = mem_descr; - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdInference, NULL, mem_descr->layout_int, eps_, - dnnUseInputMeanVariance | dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_, - dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst); - bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst); - bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc); - - e = dnnBatchNormalizationCreateBackward_v2( - &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags); - CHECK_EQ(e, E_SUCCESS); - } - } -#endif - if (NULL == bottom_data) { - if (batchNormFwdInference == NULL) { - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdInference, NULL, layout_usr_, eps_, - dnnUseInputMeanVariance | dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateBackward_v2( - &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags); - CHECK_EQ(e, E_SUCCESS); - } - bottom_data = reinterpret_cast(data.dptr_); - } - - DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); - // use_weight_bias_ - for (int i = 0; i < channels_; i++) { - scaleShift_buf[i] = (slope.dptr_)[i]; - } - for (int i = 0; i < channels_; i++) { - scaleShift_buf[channels_ + i] = (bias.dptr_)[i]; - } - - void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceSrc] = bottom_data; - BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; - - BatchNorm_res[dnnResourceDst] = 
fwd_top_data->get_output_ptr(out.dptr_, - fwd_top_data, out_data[batchnorm::kOut]); - if (ctx.is_train && !param_.use_global_stats) { - Tensor mean = out_data[batchnorm::kMean].get(s); - Tensor var = out_data[batchnorm::kVar].get(s); - CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo); - CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo); - BatchNorm_res[dnnResourceMean] = mean.dptr_; - BatchNorm_res[dnnResourceVariance] = var.dptr_; - e = dnnExecute(batchNormFwdTraining, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); - } else { - BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; - BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; - e = dnnExecute(batchNormFwdInference, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); - } - -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out.dptr_); - } -#endif - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 3); - CHECK_EQ(in_grad.size(), 3); - Stream *s = ctx.get_stream(); - Tensor data, grad, grad_in; - - if (in_data[batchnorm::kData].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0], - out_grad[batchnorm::kOut].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[batchnorm::kData], dshape, s); - grad = mkl_experimental_direct_get_with_shape( - out_grad[batchnorm::kOut], dshape, s); - grad_in = mkl_experimental_direct_get_with_shape( - in_grad[batchnorm::kData], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); - grad = mkl_experimental_direct_get(out_grad[batchnorm::kOut], s); - grad_in = mkl_experimental_direct_get(in_grad[batchnorm::kData], s); - } - - Tensor slope = in_data[batchnorm::kGamma].get(s); - Tensor gslope = in_grad[batchnorm::kGamma].get(s); - Tensor gbias = in_grad[batchnorm::kBeta].get(s); - Tensor mean = out_data[batchnorm::kMean].get(s); - Tensor var = out_data[batchnorm::kVar].get(s); - Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); - Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - - if (param_.fix_gamma) slope = 1.f; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); -#endif - if (NULL == bottom_data) - bottom_data = reinterpret_cast(data.dptr_); - - dnnError_t e; - void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceSrc] = bottom_data; - BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; - if (ctx.is_train && !param_.use_global_stats) { - int size = mean.size(0); // Tensor - float * moving_mean_ptr = reinterpret_cast(moving_mean.dptr_); - float * mean_ptr = reinterpret_cast(mean.dptr_); - float * moving_var_ptr = reinterpret_cast(moving_var.dptr_); - float * var_ptr = reinterpret_cast(var.dptr_); - float minus_mom = (1 - param_.momentum); - for (int i = 0; i < size; i++) { - moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum - + mean_ptr[i] * minus_mom; - } - for (int i = 0; i < size; i++) { - moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum - + var_ptr[i] * minus_mom; - } - BatchNorm_res[dnnResourceMean] = mean.dptr_; - BatchNorm_res[dnnResourceVariance] = 
var.dptr_; - } else { - BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; - BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; - } - - - BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_, - bwd_bottom_diff, in_grad[batchnorm::kData]); - BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_, - true, out_grad[batchnorm::kOut]); - BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr; - e = dnnExecute(batchNormBwdScaleShift, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff->conversion_needed()) { - bwd_bottom_diff->convert_from_prv(grad_in.dptr_); - } -#endif - DType * scaleShiftDiff_buf = reinterpret_cast(scaleShiftDiff_space.dptr); - if (!param_.fix_gamma) { - // Store ScaleShift blobs - DType* diff_scale = gslope.dptr_; - for (int i = 0; i < channels_; i++) { - diff_scale[i] = scaleShiftDiff_buf[i]; - } - } else { - int gslope_size = gslope.size(0); - float * gslope_ptr = reinterpret_cast(gslope.dptr_); - for (int i = 0; i < gslope_size; i++) { - *gslope_ptr++ = 0.0f; - } - } - DType* diff_shift = gbias.dptr_; - for (int i = 0; i < channels_; i++) { - diff_shift[i] = scaleShiftDiff_buf[channels_ + i]; - } - } - - private: - BatchNormParam param_; - DType eps_; - bool use_weight_bias_; - - int num_; - int channels_; - int height_; - int width_; - bool init_mkldnn_ = false; - std::shared_ptr > fwd_top_data; - std::shared_ptr > fwd_bottom_data; - std::shared_ptr > bwd_top_diff; - std::shared_ptr > bwd_bottom_diff; - dnnPrimitive_t batchNormFwdInference = NULL; - dnnPrimitive_t batchNormFwdTraining = NULL; - dnnPrimitive_t batchNormBwdScaleShift = NULL; - Storage::Handle scaleShift_space; - Storage::Handle scaleShiftDiff_space; - dnnLayout_t layout_usr_ = NULL; -}; // class BatchNormOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h deleted file mode 100644 index 1ed1e81d1303..000000000000 --- a/src/operator/mkl/mkl_concat-inl.h +++ /dev/null @@ -1,314 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_concat-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../channel_op_common.h" -#include "./mkl_util-inl.h" -namespace mxnet { -namespace op { - - -template -class MKLConcatOp : public Operator { - public: - static std::string getName() { - return "MKLConcatOp"; - } - explicit MKLConcatOp(ConcatParam param) - : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) { - concatFwd_ = static_cast(NULL); - concatBwd_ = static_cast(NULL); - fwd_top_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - - num_concats_ = param.num_args; - } - virtual ~MKLConcatOp() { - dnnDelete(concatFwd_); - dnnDelete(concatBwd_); - } - - private: - void LayerSetUp(const std::vector > &data, - const mshadow::Tensor &out, - size_t data_shape_size, size_t *split_channels_) { - size_t dim_src = data_shape_size; - size_t dim_dst = dim_src; - num_concats_ = size_; - channels_ = 0; - - for (size_t i = 1; i < num_concats_; ++i) { - for (size_t j = 1; j < data_shape_size; ++j) { - if (j == dimension_) continue; - CHECK_EQ(data[0].shape_[j], data[i].shape_[j]); - } - } - - for (size_t i = 0; i < num_concats_; ++i) { - CHECK_EQ((int)dim_src, data[i].shape_.kDimension); - - fwd_bottom_data_.push_back(MKLData::create()); - bwd_bottom_diff_.push_back(MKLData::create()); - fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]"; - bwd_bottom_diff_[i]->name = "bwd_bottom_data[i]"; - - size_t *sizes_src = new size_t[dim_src]; - size_t *strides_src = new size_t[dim_src]; - for (size_t d = 0; d < dim_src; ++d) { - sizes_src[d] = data[i].shape_[dim_src - d - 1]; - strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1]; - } - - split_channels_[i] = data[i].shape_[1]; - channels_ += split_channels_[i]; - fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src); - bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src); - delete[] sizes_src; - delete[] strides_src; - } - size_t *sizes_dst = new size_t[dim_dst]; - size_t *strides_dst = new size_t[dim_dst]; - for (size_t d = 0; d < dim_dst; ++d) { - if (d == 2) - sizes_dst[d] = channels_; - else - sizes_dst[d] = data[0].shape_[dim_dst - 1 - d]; - strides_dst[d] = (d == 0) ? 
1 : strides_dst[d - 1] * sizes_dst[d - 1]; - } - bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst); - fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst); - delete[] sizes_dst; - delete[] strides_dst; - concatFwd_ = NULL; - concatBwd_ = NULL; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(static_cast(in_data.size()), size_); - CHECK_EQ(out_data.size(), 1); - CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim()); - Stream *s = ctx.get_stream(); - std::vector > data(size_); - Tensor out; - if (in_data[0].ndim() == 2) { - for (int i = 0; i < size_; ++i) { - Shape<4> dshape = Shape4(in_data[i].shape_[0], - in_data[i].shape_[1], 1, 1); - data[i] = mkl_experimental_direct_get_with_shape( - in_data[i], dshape, s); - } - Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], - out_data[concat_enum::kOut].shape_[1], 1, 1); - out = mkl_experimental_direct_get_with_shape( - out_data[concat_enum::kOut], dshape, s); - } else if (in_data[0].ndim() == 3) { - for (int i = 0; i < size_; ++i) { - Shape<4> dshape = Shape4(in_data[i].shape_[0], - in_data[i].shape_[1], in_data[i].shape_[2], 1); - data[i] = mkl_experimental_direct_get_with_shape( - in_data[i], dshape, s); - } - Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], - out_data[concat_enum::kOut].shape_[1], - out_data[concat_enum::kOut].shape_[2], 1); - out = mkl_experimental_direct_get_with_shape( - out_data[concat_enum::kOut], dshape, s); - } else { - for (int i = 0; i < size_; ++i) { - data[i] = mkl_experimental_direct_get(in_data[i], s); - } - out = mkl_experimental_direct_get(out_data[concat_enum::kOut], s); - } - size_t *split_channels_ = new size_t[num_concats_]; - if (!init_mkldnn_) { - init_mkldnn_ = true; - LayerSetUp(data, out, 4, split_channels_); - } - - dnnError_t e; - std::vector bottom_data; - bool isFirstPass = (concatFwd_ == NULL); - dnnLayout_t *layouts = NULL; - if (isFirstPass) { - layouts = new dnnLayout_t[num_concats_]; - } - - for (size_t i = 0; i < num_concats_; i++) { - void * bottom_i = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_i = mkl_prv_data(in_data[i]); - if (bottom_i != NULL) { - if (isFirstPass) { - std::shared_ptr > mem_descr = - mkl_get_mem_desc(in_data[i].Mkl_mem_); - fwd_bottom_data_[i] = mem_descr; - layouts[i] = mem_descr->layout_int; - } - } -#endif - if (bottom_i == NULL) { - bottom_i = data[i].dptr_; - if (isFirstPass) { - layouts[i] = fwd_bottom_data_[i]->layout_usr; - } - } - - bottom_data.push_back(reinterpret_cast(bottom_i)); - } - - if (isFirstPass) { - e = dnnConcatCreate(&concatFwd_, NULL, num_concats_, layouts); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst); - bwd_top_diff_->create_internal_layout(concatFwd_, dnnResourceDst); - - e = dnnSplitCreate(&concatBwd_, NULL, num_concats_, - bwd_top_diff_->layout_int, split_channels_); - CHECK_EQ(e, E_SUCCESS); - - for (size_t n = 0; n < num_concats_; ++n) { - fwd_bottom_data_[n]->create_internal_layout(concatFwd_, - (dnnResourceType_t)(dnnResourceMultipleSrc + n)); - bwd_bottom_diff_[n]->create_internal_layout(concatBwd_, - (dnnResourceType_t)(dnnResourceMultipleDst + n)); - } - } - delete[] layouts; - - void *concat_res[dnnResourceNumber]; - for (size_t i = 0; i < num_concats_; ++i) { - concat_res[dnnResourceMultipleSrc + i] - = 
reinterpret_cast(bottom_data[i]); - } - - concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_, - fwd_top_data_, out_data[concat_enum::kOut]); - e = dnnExecute(concatFwd_, concat_res); - CHECK_EQ(e, E_SUCCESS); - delete[] split_channels_; - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_grad.size(), static_cast(size_)); - Stream *s = ctx.get_stream(); - std::vector > grad_in(size_); - Tensor grad; - if (in_grad[0].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], - out_grad[concat_enum::kOut].shape_[1], 1, 1); - grad = mkl_experimental_direct_get_with_shape( - out_grad[concat_enum::kOut], dshape, s); - for (int i = 0; i < size_; ++i) { - dshape = Shape4(in_grad[i].shape_[0], - in_grad[i].shape_[1], 1, 1); - grad_in[i] = mkl_experimental_direct_get_with_shape( - in_grad[i], dshape, s); - } - } else if (in_grad[0].ndim() == 3) { - Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], - out_grad[concat_enum::kOut].shape_[1], - out_grad[concat_enum::kOut].shape_[2], 1); - grad = mkl_experimental_direct_get_with_shape( - out_grad[concat_enum::kOut], dshape, s); - for (int i = 0; i < size_; ++i) { - dshape = Shape4(in_grad[i].shape_[0], - in_grad[i].shape_[1], in_grad[i].shape_[2], 1); - grad_in[i] = mkl_experimental_direct_get_with_shape( - in_grad[i], dshape, s); - } - } else { - grad = mkl_experimental_direct_get(out_grad[concat_enum::kOut], s); - for (int i = 0; i < size_; ++i) { - grad_in[i] = mkl_experimental_direct_get(in_grad[i], s); - } - } - - int need_bwd = 0; - for (size_t n = 0; n < num_concats_; n++) { - need_bwd += req[n]; - } - if (!need_bwd) { - return; - } - - dnnError_t e; - void *concat_res[dnnResourceNumber]; - concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true, - out_grad[concat_enum::kOut]); - for (size_t i = 0; i < num_concats_; ++i) { - concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr( - grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]); - } - e = dnnExecute(concatBwd_, concat_res); - CHECK_EQ(e, E_SUCCESS); - } - - private: - int size_; - size_t dimension_; - - bool init_mkldnn_; - - dnnPrimitive_t concatFwd_; - dnnPrimitive_t concatBwd_; - std::shared_ptr > fwd_top_data_; - std::vector< std::shared_ptr > > fwd_bottom_data_; - std::shared_ptr > bwd_top_diff_; - std::vector< std::shared_ptr > > bwd_bottom_diff_; - - - size_t width_; - size_t height_; - size_t channels_; - size_t num_; - size_t num_concats_; -}; // class MKLConcatOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ diff --git a/src/operator/mkl/mkl_convolution-inl.h b/src/operator/mkl/mkl_convolution-inl.h deleted file mode 100644 index 813d061f172b..000000000000 --- a/src/operator/mkl/mkl_convolution-inl.h +++ /dev/null @@ -1,490 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_convolution-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../nn/convolution-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLConvolutionOp : public Operator { - public: - static std::string getName() { - return "MKLConvolutionOp"; - } - void SetupBuffer() { - convolutionBwdBias = static_cast(NULL); - convolutionBwdFilter = static_cast(NULL); - convolutionBwdData = static_cast(NULL); - convolutionFwd = static_cast(NULL); - fwd_bottom_data = MKLData::create(); - fwd_top_data = MKLData::create(); - fwd_filter_data = MKLData::create(); - fwd_bias_data = MKLData::create(); - bwdd_top_diff = MKLData::create(); - bwdd_bottom_diff = MKLData::create(); - bwdd_filter_data = MKLData::create(); - bwdf_top_diff = MKLData::create(); - bwdf_filter_diff = MKLData::create(); - bwdf_bottom_data = MKLData::create(); - bwdb_top_diff = MKLData::create(); - bwdb_bias_diff = MKLData::create(); - // Names are for debugging purposes only. - fwd_bottom_data->name = "fwd_bottom_data @ " + this->getName(); - fwd_top_data->name = "fwd_top_data @ " + this->getName(); - fwd_filter_data->name = "fwd_filter_data @ " + this->getName(); - fwd_bias_data->name = "fwd_bias_data @ " + this->getName(); - bwdd_top_diff->name = "bwdd_top_diff @ " + this->getName(); - bwdd_bottom_diff->name = "bwdd_bottom_diff @ " + this->getName(); - bwdd_filter_data->name = "bwdd_filter_data @ " + this->getName(); - bwdf_top_diff->name = "bwdf_top_diff @ " + this->getName(); - bwdf_bottom_data->name = "bwdf_bottom_data @ " + this->getName(); - bwdf_filter_diff->name = "bwdf_filter_diff @ " + this->getName(); - bwdb_top_diff->name = "bwdb_top_diff @ " + this->getName(); - bwdb_bias_diff->name = "bwdb_bias_diff @ " + this->getName(); - } - - explicit MKLConvolutionOp(ConvolutionParam p): - convolutionFwd(NULL), - convolutionBwdData(static_cast(NULL)), - convolutionBwdFilter(static_cast(NULL)), - convolutionBwdBias(static_cast(NULL)) { - this->param_ = p; - init_mkldnn_ = false; - // convert MBytes first to Bytes and then to elements. 
- param_.workspace = (param_.workspace << 20) / sizeof(DType); - SetupBuffer(); - } - void ReleaseBuffer() { - if (convolutionFwd != NULL) { - dnnDelete(convolutionFwd); - convolutionFwd = NULL; - } - if (convolutionBwdData != NULL) { - dnnDelete(convolutionBwdData); - convolutionBwdData = NULL; - } - if (convolutionBwdFilter != NULL) { - dnnDelete(convolutionBwdFilter); - convolutionBwdFilter = NULL; - } - if (!param_.no_bias && convolutionBwdBias != NULL) { - dnnDelete(convolutionBwdBias); - convolutionBwdBias = NULL; - } - } - virtual ~MKLConvolutionOp() { - ReleaseBuffer(); - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - this->width_ = data.shape_[3]; - this->height_ = data.shape_[2]; - this->channels_ = data.shape_[1]; - this->num_ = data.shape_[0]; - this->group_ = param_.num_group; - this->width_out_ = out.shape_[3]; - this->height_out_ = out.shape_[2]; - int channel_out_ = out.shape_[1]; - this->num_output_ = channel_out_; - kernel_w_ = param_.kernel[1]; - kernel_h_ = param_.kernel[0]; - stride_w_ = param_.stride[1]; - stride_h_ = param_.stride[0]; - pad_w_ = param_.pad[1]; - pad_h_ = param_.pad[0]; - int status; - size_t n, g; - size_t iw, ih, ic; - size_t ow, oh, oc; - size_t kw, kh; - size_t dimension = 4; - g = std::max(this->group_, 1); - n = this->num_; - iw = this->width_; - ih = this->height_; - ic = this->channels_; - ow = this->width_out_; - oh = this->height_out_; - oc = this->num_output_; - kw = this->kernel_w_; - kh = this->kernel_h_; - oc = this->num_output_; - size_t bdata_sizes[4] = { iw, ih, ic, n }; - size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic }; - /* starting with MKL 2017 Gold in case of groups filter layout - * becomes 5D, i.e. groups become a separate dimension */ - size_t g_mkl2017 = g; - size_t f_dimension = dimension + (g != 1); - if (getMKLBuildDate() < 20160701) { - g_mkl2017 = 1; - f_dimension = dimension; - } - size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 }; - size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g }; - size_t bias_sizes[1] = { oc }; - size_t bias_strides[1] = { 1 }; - size_t tdata_sizes[4] = { ow, oh, oc, n }; - size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc }; - size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ }; - int inputOffset[2] = { -this->pad_w_, -this->pad_h_ }; - // Names are for debugging purposes only. 
- /*** convolution section ***/ - if (!param_.no_bias) { - status = dnnGroupsConvolutionCreateForwardBias(&convolutionFwd, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - } else { - status = dnnGroupsConvolutionCreateForward(&convolutionFwd, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - } - CHECK_EQ(status, 0) - << "Failed dnnCreateConvolution(dnnForward) with status " - << status << "\n"; - fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension, - bdata_sizes, bdata_strides); - fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension, - tdata_sizes, tdata_strides); - fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter, - f_dimension, fdata_sizes, fdata_strides); - if (!param_.no_bias) - fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1, - bias_sizes, bias_strides); - /* - * Backward by data layer setup - */ - status = dnnGroupsConvolutionCreateBackwardData(&convolutionBwdData, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardData with status " - << status << "\n"; - bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc, - dimension, bdata_sizes, bdata_strides); - bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter, - f_dimension, fdata_sizes, fdata_strides); - /* - * Backward by filter layer setup - */ - status = dnnGroupsConvolutionCreateBackwardFilter(&convolutionBwdFilter, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardFilter with status " - << status << "\n"; - bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc, - dimension, bdata_sizes, bdata_strides); - bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter, - f_dimension, fdata_sizes, fdata_strides); - /* - * Backward by bias layer setup - */ - if (!param_.no_bias) { - status = dnnGroupsConvolutionCreateBackwardBias(&convolutionBwdBias, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - tdata_sizes); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardBias with status " - << status << "\n"; - bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1, - bias_sizes, bias_strides); - } - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - DType *data_ptr = NULL; - DType *wmat_ptr = NULL; - DType *out_ptr = NULL; - Tensor data = - mkl_experimental_direct_get(in_data[conv::kData], s); - Tensor out = - mkl_experimental_direct_get(out_data[conv::kOut], s); - Tensor wmat = - 
mkl_experimental_direct_get(in_data[conv::kWeight], s); - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(wmat.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - data_ptr = data.dptr_; - wmat_ptr = wmat.dptr_; - out_ptr = out.dptr_; - int status; - void *res_convolutionFwd[dnnResourceNumber]; - res_convolutionFwd[dnnResourceSrc] = - fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]); - res_convolutionFwd[dnnResourceFilter] = - fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]); - if (!param_.no_bias) { - Tensor bias = - mkl_experimental_direct_get(in_data[conv::kBias], s); - res_convolutionFwd[dnnResourceBias] = - fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]); - } - - res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr, - fwd_top_data, out_data[conv::kOut]); - status = dnnExecute(convolutionFwd, res_convolutionFwd); - CHECK_EQ(status, 0) << "Forward convolution failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out_ptr); - } -#endif - } - void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) { - int blob_byte_size = blob_size * sizeof(DType); - *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU()); - memcpy(pws->dptr, src, blob_byte_size); - } - void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) { - DType *dst = reinterpret_cast(dst_); - DType *src = reinterpret_cast(pws->dptr); -#pragma omp parallel for - for (int i = 0; i < blob_size; i++) { - dst[i] += src[i]; - } - if (pws->dptr) - Storage::Get()->Free(*pws); - pws->dptr = NULL; - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - if (param_.kernel.ndim() > 2) { - LOG(FATAL) << "Volume convolution is not implmented in mshadow"; - } - CHECK_EQ(out_grad.size(), 1); - size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); - Stream *s = ctx.get_stream(); - Tensor data = - mkl_experimental_direct_get(in_data[conv::kData], s); - Shape<3> wmat_shape = - Shape3(param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); - Tensor wmat = - mkl_experimental_direct_get_with_shape( - in_data[conv::kWeight], wmat_shape, s); - Tensor grad = - mkl_experimental_direct_get(out_grad[conv::kOut], s); - Tensor gdata = - mkl_experimental_direct_get(in_grad[conv::kData], s); - Tensor gwmat = - mkl_experimental_direct_get_with_shape( - in_grad[conv::kWeight], wmat_shape, s); - - if (!init_mkldnn_) { - init_mkldnn_ = true; - LayerSetUp(data, grad); - } - int status; - if (req[0]) { - void *res_convolutionBwdData[dnnResourceNumber]; - res_convolutionBwdData[dnnResourceDiffDst] = - bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdData[dnnResourceFilter] = - bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]); - Storage::Handle addtoWorkspace; - if (req[0] == kAddTo) { - // wait mkl support addto mode - AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace); - } - - res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_, - bwdd_bottom_diff, in_grad[conv::kData]); - status = dnnExecute(convolutionBwdData, res_convolutionBwdData); - CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdd_bottom_diff->conversion_needed()) { - bwdd_bottom_diff->convert_from_prv(gdata.dptr_); - } -#endif - if (req[0] == kAddTo) { - if (bwdd_bottom_diff->conversion_needed()) { - bwdd_bottom_diff->convert_from_prv(gdata.dptr_); - } - AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size()); - } - } - if (req[1]) { - void *res_convolutionBwdFilter[dnnResourceNumber]; - - res_convolutionBwdFilter[dnnResourceDiffDst] = - bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdFilter[dnnResourceSrc] = - bwdf_bottom_data->get_converted_prv(data.dptr_, false, - in_data[conv::kData]); - Storage::Handle addtoWorkspace; - if (req[1] == kAddTo) { - // wait mkl support addto mode - AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace); - } - - res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr( - gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]); - status = dnnExecute(convolutionBwdFilter, res_convolutionBwdFilter); - CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdf_filter_diff->conversion_needed()) { - bwdf_filter_diff->convert_from_prv(gwmat.dptr_); - } -#endif - if (req[1] == kAddTo) { - if (bwdf_filter_diff->conversion_needed()) { - bwdf_filter_diff->convert_from_prv(gwmat.dptr_); - } - AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size()); - } - } - if (!param_.no_bias) { - Tensor gbias = - mkl_experimental_direct_get(in_grad[conv::kBias], s); - void *res_convolutionBwdBias[dnnResourceNumber]; - res_convolutionBwdBias[dnnResourceDiffDst] = - bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdBias[dnnResourceDiffBias] = 
bwdb_bias_diff->get_output_ptr(gbias.dptr_, - bwdb_bias_diff, in_grad[conv::kBias]); - status = dnnExecute(convolutionBwdBias, res_convolutionBwdBias); - CHECK_EQ(status, 0) << "Backward Bias failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdb_bias_diff->conversion_needed()) { - bwdb_bias_diff->convert_from_prv(gbias.dptr_); - } -#endif - } - } - - private: - ConvolutionParam param_; - size_t width_, - height_, - width_out_, - height_out_, - kernel_w_, - kernel_h_, - stride_w_, - stride_h_; - int group_, - num_, - num_output_; - size_t channels_; - int pad_w_, - pad_h_; - bool init_mkldnn_; - dnnPrimitive_t convolutionFwd; - dnnPrimitive_t convolutionBwdData; - dnnPrimitive_t convolutionBwdFilter; - dnnPrimitive_t convolutionBwdBias; - /* Fwd step */ - std::shared_ptr > fwd_bottom_data, fwd_top_data, fwd_filter_data, - fwd_bias_data; - /* Bwd data step */ - std::shared_ptr > bwdd_top_diff, bwdd_bottom_diff; - std::shared_ptr > bwdd_filter_data; - /* Bwd filter step */ - std::shared_ptr > bwdf_top_diff, bwdf_filter_diff; - std::shared_ptr > bwdf_bottom_data; - std::shared_ptr > bwdf_filter_diff_iter, bwdf2fwd_filter_diff, - bwdb_bias_diff_iter; - /* Bwd bias step */ - std::shared_ptr > bwdb_top_diff, bwdb_bias_diff; -}; // class ConvolutionOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc deleted file mode 100644 index 507e5498c85b..000000000000 --- a/src/operator/mkl/mkl_cppwrapper.cc +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_cppwrapper.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ - - - -#include "mkl_cppwrapper.h" -#include -#if MXNET_USE_MKL2017 == 1 -#include "mkl_service.h" - -int getMKLBuildDate() { - static int build = 0; - if (build == 0) { - MKLVersion v; - mkl_get_version(&v); - build = atoi(v.Build); - printf("MKL Build:%d\n", build); - } - return build; -} - -bool enableMKLWarnGenerated() { - return false; -} -#endif // MSHADOW_USE_MKL2017 diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h deleted file mode 100644 index 7d66f20ad308..000000000000 --- a/src/operator/mkl/mkl_cppwrapper.h +++ /dev/null @@ -1,1020 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_cppwrapper.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ -#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ - - -#include -#include -#if MXNET_USE_MKL2017 == 1 -#include "mkl_dnn_types.h" -#include "mkl_dnn.h" -#include "mkl_version.h" - - -extern int getMKLBuildDate(); -extern bool enableMKLWarnGenerated(); - - -template inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]); -template <> inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { - return dnnLayoutCreate_F32(pLayout, dimension, size, strides); -} -template <> inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { - return dnnLayoutCreate_F64(pLayout, dimension, size, strides); -} - -template inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type); -template <> inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { - return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type); -} -template <> inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { - return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type); -} - -template inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout); -template <> inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout) { - return dnnLayoutGetMemorySize_F32(layout); -} -template <> inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout) { - return dnnLayoutGetMemorySize_F64(layout); -} - -template inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2); -template <> inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2) { - return dnnLayoutCompare_F32(l1, l2); -} -template <> inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2) { - return dnnLayoutCompare_F64(l1, l2); -} - - -template inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout); -template <> inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout) { - return dnnAllocateBuffer_F32(pPtr, layout); -} -template <> inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout) { - return dnnAllocateBuffer_F64(pPtr, layout); -} - -template inline dnnError_t dnnReleaseBuffer( - void *ptr); -template <> inline dnnError_t dnnReleaseBuffer( - void *ptr) { - return dnnReleaseBuffer_F32(ptr); -} -template <> inline dnnError_t dnnReleaseBuffer( - void *ptr) { - return dnnReleaseBuffer_F64(ptr); -} - -template inline dnnError_t dnnLayoutDelete( - dnnLayout_t layout); -template <> inline dnnError_t dnnLayoutDelete( - dnnLayout_t layout) { - return dnnLayoutDelete_F32(layout); -} -template <> inline dnnError_t 
-template <> inline dnnError_t dnnLayoutDelete<double>(
-    dnnLayout_t layout) {
-  return dnnLayoutDelete_F64(layout);
-}
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesCreate(
-    dnnPrimitiveAttributes_t *attributes);
-template <> inline dnnError_t dnnPrimitiveAttributesCreate<float>(
-    dnnPrimitiveAttributes_t *attributes) {
-  return dnnPrimitiveAttributesCreate_F32(attributes);
-}
-template <> inline dnnError_t dnnPrimitiveAttributesCreate<double>(
-    dnnPrimitiveAttributes_t *attributes) {
-  return dnnPrimitiveAttributesCreate_F64(attributes);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesDestroy(
-    dnnPrimitiveAttributes_t attributes);
-template <> inline dnnError_t dnnPrimitiveAttributesDestroy<float>(
-    dnnPrimitiveAttributes_t attributes) {
-  return dnnPrimitiveAttributesDestroy_F32(attributes);
-}
-template <> inline dnnError_t dnnPrimitiveAttributesDestroy<double>(
-    dnnPrimitiveAttributes_t attributes) {
-  return dnnPrimitiveAttributesDestroy_F64(attributes);
-}
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveGetAttributes(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes);
-template <> inline dnnError_t dnnPrimitiveGetAttributes<float>(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes) {
-  return dnnPrimitiveGetAttributes_F32(primitive, attributes);
-}
-template <> inline dnnError_t dnnPrimitiveGetAttributes<double>(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes) {
-  return dnnPrimitiveGetAttributes_F64(primitive, attributes);
-}
-
-template <typename Dtype> inline dnnError_t dnnExecute(
-    dnnPrimitive_t primitive, void *resources[]);
-template <> inline dnnError_t dnnExecute<float>(
-    dnnPrimitive_t primitive, void *resources[]) {
-  return dnnExecute_F32(primitive, resources);
-}
-template <> inline dnnError_t dnnExecute<double>(
-    dnnPrimitive_t primitive, void *resources[]) {
-  return dnnExecute_F64(primitive, resources);
-}
-
-template <typename Dtype> inline dnnError_t dnnExecuteAsync(
-    dnnPrimitive_t primitive, void *resources[]);
-template <> inline dnnError_t dnnExecuteAsync<float>(
-    dnnPrimitive_t primitive, void *resources[]) {
-  return dnnExecuteAsync_F32(primitive, resources);
-}
-template <> inline dnnError_t dnnExecuteAsync<double>(
-    dnnPrimitive_t primitive, void *resources[]) {
-  return dnnExecuteAsync_F64(primitive, resources);
-}
-
-template <typename Dtype> inline dnnError_t dnnWaitFor(
-    dnnPrimitive_t primitive);
-template <> inline dnnError_t dnnWaitFor<float>(
-    dnnPrimitive_t primitive) {
-  return dnnWaitFor_F32(primitive);
-}
-template <> inline dnnError_t dnnWaitFor<double>(
-    dnnPrimitive_t primitive) {
-  return dnnWaitFor_F64(primitive);
-}
-
-template <typename Dtype> inline dnnError_t dnnDelete(
-    dnnPrimitive_t primitive);
-template <> inline dnnError_t dnnDelete<float>(
-    dnnPrimitive_t primitive) {
-  return dnnDelete_F32(primitive);
-}
-template <> inline dnnError_t dnnDelete<double>(
-    dnnPrimitive_t primitive) {
-  return dnnDelete_F64(primitive);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConversionCreate(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to);
-template <> inline dnnError_t dnnConversionCreate<float>(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
-  return dnnConversionCreate_F32(pConversion, from, to);
-}
-template <> inline dnnError_t dnnConversionCreate<double>(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
-  return dnnConversionCreate_F64(pConversion, from, to);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConversionExecute(
-    dnnPrimitive_t conversion, void *from, void *to);
-template <> inline dnnError_t dnnConversionExecute<float>(
-    dnnPrimitive_t conversion, void *from, void *to) {
-  return dnnConversionExecute_F32(conversion, from, to);
-}
-template <> inline dnnError_t dnnConversionExecute<double>(
-    dnnPrimitive_t conversion, void *from, void *to) {
-  return dnnConversionExecute_F64(conversion, from, to);
-}
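A typical use of the conversion wrappers above, as the deleted operators exercised them (sketch only; ConvertLayout is a hypothetical helper, and error handling is reduced to CHECKs as in the surrounding code):

// Convert a buffer between two MKL layouts: create, execute, delete.
inline void ConvertLayout(dnnLayout_t from, dnnLayout_t to, void* src, void* dst) {
  dnnPrimitive_t cvt = NULL;
  CHECK_EQ(dnnConversionCreate<float>(&cvt, from, to), E_SUCCESS);
  CHECK_EQ(dnnConversionExecute<float>(cvt, src, dst), E_SUCCESS);
  CHECK_EQ(dnnDelete<float>(cvt), E_SUCCESS);  // conversions are one-shot primitives here
}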
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateForward(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateForward<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateForward_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-template <> inline dnnError_t dnnConvolutionCreateForward<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateForward_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateForwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateForwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateForwardBias_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateForwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateForwardBias_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardData(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateBackwardData<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateBackwardData_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardData<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateBackwardData_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardFilter(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateBackwardFilter_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnConvolutionCreateBackwardFilter_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]);
-template <> inline dnnError_t dnnConvolutionCreateBackwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]) {
-  return dnnConvolutionCreateBackwardBias_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, dstSize);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]) {
-  return dnnConvolutionCreateBackwardBias_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      dimension, dstSize);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForward(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateForward<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateForward_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateForward<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateForward_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateForwardBias_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateForwardBias_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardData(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateBackwardData_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateBackwardData_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateBackwardFilter_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnGroupsConvolutionCreateBackwardFilter_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, srcSize, dstSize, filterSize,
-      convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]) {
-  return dnnGroupsConvolutionCreateBackwardBias_F32(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, dstSize);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]) {
-  return dnnGroupsConvolutionCreateBackwardBias_F64(
-      pConvolution,
-      attributes,
-      algorithm,
-      groups, dimension, dstSize);
-}
-
-template <typename Dtype> inline dnnError_t dnnReLUCreateForward(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope);
-template <> inline dnnError_t dnnReLUCreateForward<float>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope) {
-  return dnnReLUCreateForward_F32(
-      pRelu,
-      attributes,
-      dataLayout, negativeSlope);
-}
-template <> inline dnnError_t dnnReLUCreateForward<double>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope) {
-  return dnnReLUCreateForward_F64(
-      pRelu,
-      attributes,
-      dataLayout, negativeSlope);
-}
-
-template <typename Dtype> inline dnnError_t dnnReLUCreateBackward(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope);
-template <> inline dnnError_t dnnReLUCreateBackward<float>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
-  return dnnReLUCreateBackward_F32(
-      pRelu,
-      attributes,
-      diffLayout, dataLayout, negativeSlope);
-}
-template <> inline dnnError_t dnnReLUCreateBackward<double>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
-  return dnnReLUCreateBackward_F64(
-      pRelu,
-      attributes,
-      diffLayout, dataLayout, negativeSlope);
-}
-
-template <typename Dtype> inline dnnError_t dnnLRNCreateForward(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k);
-template <> inline dnnError_t dnnLRNCreateForward<float>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
-  return dnnLRNCreateForward_F32(
-      pLrn,
-      attributes,
-      dataLayout, kernel_size, alpha, beta, k);
-}
-template <> inline dnnError_t dnnLRNCreateForward<double>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
-  return dnnLRNCreateForward_F64(
-      pLrn,
-      attributes,
-      dataLayout, kernel_size, alpha, beta, k);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnLRNCreateBackward(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k);
-template <> inline dnnError_t dnnLRNCreateBackward<float>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k) {
-  return dnnLRNCreateBackward_F32(
-      pLrn,
-      attributes,
-      diffLayout, dataLayout, kernel_size, alpha, beta, k);
-}
-template <> inline dnnError_t dnnLRNCreateBackward<double>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k) {
-  return dnnLRNCreateBackward_F64(
-      pLrn,
-      attributes,
-      diffLayout, dataLayout, kernel_size, alpha, beta, k);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPoolingCreateForward(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnPoolingCreateForward<float>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnPoolingCreateForward_F32(
-      pPooling,
-      attributes,
-      op,
-      srcLayout,
-      kernelSize, kernelStride,
-      inputOffset, border_type);
-}
-template <> inline dnnError_t dnnPoolingCreateForward<double>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnPoolingCreateForward_F64(
-      pPooling,
-      attributes,
-      op,
-      srcLayout,
-      kernelSize, kernelStride,
-      inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPoolingCreateBackward(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnPoolingCreateBackward<float>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnPoolingCreateBackward_F32(
-      pPooling,
-      attributes,
-      op,
-      srcLayout,
-      kernelSize, kernelStride,
-      inputOffset, border_type);
-}
-template <> inline dnnError_t dnnPoolingCreateBackward<double>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-  return dnnPoolingCreateBackward_F64(
-      pPooling,
-      attributes,
-      op,
-      srcLayout,
-      kernelSize, kernelStride,
-      inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConcatCreate(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]);
-template <> inline dnnError_t dnnConcatCreate<float>(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]) {
-  return dnnConcatCreate_F32(
-      pConcat,
-      attributes,
-      N,
-      src);
-}
-template <> inline dnnError_t dnnConcatCreate<double>(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]) {
-  return dnnConcatCreate_F64(
-      pConcat,
-      attributes,
-      N,
-      src);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnSplitCreate(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]);
-template <> inline dnnError_t dnnSplitCreate<float>(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]) {
-  return dnnSplitCreate_F32(
-      pSplit,
-      attributes,
-      N,
-      src,
-      dst);
-}
-template <> inline dnnError_t dnnSplitCreate<double>(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]) {
-  return dnnSplitCreate_F64(
-      pSplit,
-      attributes,
-      N,
-      src,
-      dst);
-}
-
-template <typename Dtype> inline dnnError_t dnnSumCreate(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, Dtype *coefficients);
-template <> inline dnnError_t dnnSumCreate<float>(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, float *coefficients) {
-  return dnnSumCreate_F32(
-      pSum,
-      attributes,
-      nSummands,
-      layout, coefficients);
-}
-template <> inline dnnError_t dnnSumCreate<double>(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, double *coefficients) {
-  return dnnSumCreate_F64(
-      pSum,
-      attributes,
-      nSummands,
-      layout, coefficients);
-}
-
-template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateForward_v2(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags);
-
-template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<float>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-  return dnnBatchNormalizationCreateForward_v2_F32(
-      pBatchNormalization,
-      attributes,
-      dataLayout, eps, flags);
-}
-template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<double>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-  return dnnBatchNormalizationCreateForward_v2_F64(
-      pBatchNormalization,
-      attributes,
-      dataLayout, eps, flags);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateBackward_v2(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags);
-
-template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2<float>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-  return dnnBatchNormalizationCreateBackward_v2_F32(
-      pBatchNormalization,
-      attributes,
-      dataLayout, eps, flags);
-}
-
-template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2<double>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-  return dnnBatchNormalizationCreateBackward_v2_F64(
-      pBatchNormalization,
-      attributes,
-      dataLayout, eps, flags);
-}
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateForward(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-template <> inline dnnError_t dnnInnerProductCreateForward<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateForward_F32(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateForward<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateForward_F64(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateForwardBias(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateForwardBias<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateForwardBias_F32(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateForwardBias<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateForwardBias_F64(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardData(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardData<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateBackwardData_F32(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardData<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateBackwardData_F64(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-
-
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardFilter(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-  return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct,
-    attributes, dimensions,
-    srcSize, outputChannels);
-}
-
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardBias(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardBias<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]) {
-  return dnnInnerProductCreateBackwardBias_F32(pInnerProduct,
-    attributes, dimensions,
-    dstSize);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardBias<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]) {
-  return dnnInnerProductCreateBackwardBias_F64(pInnerProduct,
-    attributes, dimensions,
-    dstSize);
-}
-#endif  // #MXNET_USE_MKL2017 == 1
-#endif  // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h
deleted file mode 100644
index 48c931291150..000000000000
--- a/src/operator/mkl/mkl_elementwise_copy-inl.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-
-template <typename DType>
-void MKLIdentityCompute(const nnvm::NodeAttrs& attrs,
-                        const OpContext& ctx,
-                        const std::vector<TBlob>& inputs,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& outputs) {
-  if (!req[0]) return;
-#if MKL_EXPERIMENTAL == 1
-  if (op::mkl_prv_data<DType>(inputs[0])) {
-    std::shared_ptr<MKLMemHolder> in_data_mem = inputs[0].Mkl_mem_;
-    // User copy to avoid potential problem
-    std::shared_ptr<MKLData<DType> > top_data = MKLData<DType>::create();
-    std::shared_ptr<MKLMemHolder> top_mem = outputs[0].Mkl_mem_;
-    top_data->copy_from(in_data_mem);
-    top_mem->set_prv_descriptor(top_data);
-    return;
-  }
-#endif
-  int in_blob_size = inputs[0].Size();
-  int out_blob_size = outputs[0].Size();
-  CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute CPU Size not Match ";
-  memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType));
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h
deleted file mode 100644
index d313fd15a5be..000000000000
--- a/src/operator/mkl/mkl_elementwise_sum-inl.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-template <typename xpu, typename DType>
-static void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 1, DType> > &data,
-                       size_t data_shape_size,
-                       std::shared_ptr<MKLData<DType> > fwd_top_data) {
-  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
-  // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  // stable_prod_grad_ = 1;
-  size_t dim_src = data_shape_size;
-  size_t *sizes_src = new size_t[dim_src];
-  size_t *strides_src = new size_t[dim_src];
-  for (size_t d = 0; d < dim_src; ++d) {
-    sizes_src[d] = data[0].shape_[dim_src - d - 1];
-    strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
-  }
-
-  fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src);
-  delete[] sizes_src;
-  delete[] strides_src;
-}
-
-template <typename xpu, typename DType>
-void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs,
-                               const OpContext& ctx,
-                               const std::vector<TBlob>& in_data,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<TBlob>& out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  if (req[0] == kNullOp) return;
-  size_t size = in_data.size();
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  std::vector<Tensor<xpu, 1, DType> > data(size);
-  Tensor<xpu, 1, DType> out = out_data[0].FlatTo1D<xpu, DType>(s);
-  bool in_place_flag = false;
-  int in_place_idx = 0;
-
-  for (size_t i = 0; i < size; ++i) {
-    data[i] = in_data[i].FlatTo1D<xpu, DType>(s);
-    if (data[i].dptr_ == out.dptr_) {
-      in_place_idx = i;
-      in_place_flag = true;
-    }
-  }
-  std::shared_ptr<MKLData<DType> > fwd_top_data = MKLData<DType>::create();
-  std::vector<DType> coeffs_ = std::vector<DType>(data.size(), 1);
-  LayerSetUp(data, 1, fwd_top_data);
-
-
-  dnnError_t e;
-  void *eltwise_res[dnnResourceNumber];
-  dnnPrimitive_t sumPrimitive = NULL;
-  e = dnnSumCreate<DType>(&sumPrimitive, NULL, size, fwd_top_data->layout_usr,
-    &coeffs_[0]);
-  CHECK_EQ(e, E_SUCCESS);
-
-  eltwise_res[dnnResourceDst] = reinterpret_cast<void*>(const_cast<DType*>(out.dptr_));
-  eltwise_res[dnnResourceMultipleSrc] =
-    reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[in_place_idx].dptr_));
-  for (size_t i = 1; i < size; ++i) {
-    if (i == in_place_idx) continue;
-    eltwise_res[dnnResourceMultipleSrc + i] =
-      reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[i].dptr_));
-  }
-
-  e = dnnExecute<DType>(sumPrimitive, eltwise_res);
-  CHECK_EQ(e, E_SUCCESS);
-
-  if (sumPrimitive != NULL) {
-    dnnDelete<DType>(sumPrimitive);
-    sumPrimitive = NULL;
-  }
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
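Stripped of the in-place and private-layout handling, the kernel above reduces to one dnnSumCreate/dnnExecute pair. A condensed sketch using a hypothetical two-input helper (requires <vector>; coefficients of 1 give a plain element-wise sum):

template <typename DType>
void SumTwo(dnnLayout_t layout, DType* a, DType* b, DType* out) {
  std::vector<DType> coeffs(2, 1);  // out = 1*a + 1*b
  dnnPrimitive_t sum = NULL;
  CHECK_EQ(dnnSumCreate<DType>(&sum, NULL, 2, layout, &coeffs[0]), E_SUCCESS);
  void* res[dnnResourceNumber] = {};
  res[dnnResourceMultipleSrc] = a;      // source i goes in slot MultipleSrc + i
  res[dnnResourceMultipleSrc + 1] = b;
  res[dnnResourceDst] = out;
  CHECK_EQ(dnnExecute<DType>(sum, res), E_SUCCESS);
  dnnDelete<DType>(sum);
}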
diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h
deleted file mode 100644
index 5e296704b6dd..000000000000
--- a/src/operator/mkl/mkl_fully_connected-inl.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_fully_connected-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
-#include
-#include
-#include
-#include "../activation-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template <typename xpu, typename DType>
-class MKLFullyConnectedOp : public Operator {
- public:
-  explicit MKLFullyConnectedOp(const FullyConnectedParam& p,
-                               const std::vector<TShape>& in_shapes,
-                               const std::vector<TShape>& out_shapes):
-    param_(p) {
-    LayerSetUp(in_shapes, out_shapes);
-  }
-
-  ~MKLFullyConnectedOp() {
-    dnnDelete<DType>(fullyConnectedFwd);
-    dnnDelete<DType>(fullyConnectedBwdData);
-    dnnDelete<DType>(fullyConnectedBwdFilter);
-    dnnDelete<DType>(fullyConnectedBwdBias);
-  }
-  static std::string getName() {
-    return "MKLFullyConnectedOp";
-  }
-
- private:
-  void LayerSetUp(const std::vector<TShape>& in_shapes,
-                  const std::vector<TShape>& out_shapes) {
-    const TShape& ishape = in_shapes[fullc::kData];
-
-    const size_t dim = 4;
-    const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]};
-    const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]};
-    const size_t output_channels = param_.num_hidden;
-
-    dnnPrimitiveAttributes_t attributes = NULL;
-    MKLDNN_CALL(dnnPrimitiveAttributesCreate<DType>(&attributes));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnInnerProductCreateForwardBias<DType>(
-            &fullyConnectedFwd,
-            attributes,
-            dim,
-            src_sizes,
-            output_channels));
-    } else {
-      MKLDNN_CALL(dnnInnerProductCreateForward<DType>(
-            &fullyConnectedFwd,
-            attributes,
-            dim,
-            src_sizes,
-            output_channels));
-    }
-    MKLDNN_CALL(dnnInnerProductCreateBackwardData<DType>(
-          &fullyConnectedBwdData,
-          attributes,
-          dim,
-          src_sizes,
-          output_channels));
-    MKLDNN_CALL(dnnInnerProductCreateBackwardFilter<DType>(
-          &fullyConnectedBwdFilter,
-          attributes,
-          dim,
-          src_sizes,
-          output_channels));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnInnerProductCreateBackwardBias<DType>(
-            &fullyConnectedBwdBias,
-            attributes,
-            2,
-            dst_sizes));
-    }
-    // TODO(minjie): Shouldn't `attributes` be destroyed?
-  }
-
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    void* res_fullyConnected[dnnResourceNumber];
-    if (req[fullc::kOut] == kNullOp) return;
-    CHECK_EQ(req[fullc::kOut], kWriteTo);
-    CHECK_EQ(in_data.size(), param_.no_bias ? 2 : 3);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_data[fullc::kOut].shape_;
-
-    Tensor<xpu, 4, DType> data;
-    Tensor<xpu, 4, DType> out;
-
-    Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1);
-
-    Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1);
-    Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1);
-
-    data = in_data[fullc::kData].get_with_shape<xpu, 4, DType>(dshape, s);
-    out = out_data[fullc::kOut].get_with_shape<xpu, 4, DType>(odshape, s);
-    res_fullyConnected[dnnResourceSrc] =
-      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceDst] =
-      reinterpret_cast<void *>(out_data[fullc::kOut].dptr_);
-    res_fullyConnected[dnnResourceFilter] =
-      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
-    if (!param_.no_bias) {
-      res_fullyConnected[dnnResourceBias] = reinterpret_cast<void *>(in_data[fullc::kBias].dptr_);
-    }
-
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedFwd, res_fullyConnected));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    void* res_fullyConnected[dnnResourceNumber];
-    CHECK_EQ(out_grad.size(), 1);
-    const size_t expected = param_.no_bias ? 2 : 3;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    res_fullyConnected[dnnResourceSrc] =
-      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceFilter] =
-      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
-
-    res_fullyConnected[dnnResourceDiffDst] =
-      reinterpret_cast<void *>(out_grad[fullc::kOut].dptr_);
-    res_fullyConnected[dnnResourceDiffSrc] =
-      reinterpret_cast<void *>(in_grad[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceDiffFilter] =
-      reinterpret_cast<void *>(in_grad[fullc::kWeight].dptr_);
-    if (!param_.no_bias) {
-      res_fullyConnected[dnnResourceDiffBias] =
-        reinterpret_cast<void *>(in_grad[fullc::kBias].dptr_);
-    }
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdFilter, res_fullyConnected));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdBias, res_fullyConnected));
-    }
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdData, res_fullyConnected));
-  }
-
- private:
-  dnnPrimitive_t fullyConnectedFwd{nullptr};
-  dnnPrimitive_t fullyConnectedBwdData{nullptr};
-  dnnPrimitive_t fullyConnectedBwdFilter{nullptr};
-  dnnPrimitive_t fullyConnectedBwdBias{nullptr};
-  const FullyConnectedParam param_;
-};  // class MKLFullyConnectedOp
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
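The forward and backward paths above share one convention: every tensor is registered in a void* array indexed by dnnResourceType_t, and the primitive is executed once against that array. A sketch with hypothetical buffer pointers (assumes fwd was built with dnnInnerProductCreateForwardBias<float>, as in LayerSetUp above):

void RunInnerProductForward(dnnPrimitive_t fwd, float* data, float* weight,
                            float* bias, float* out) {
  void* res[dnnResourceNumber] = {};
  res[dnnResourceSrc] = data;       // input activations
  res[dnnResourceFilter] = weight;  // weight matrix
  res[dnnResourceBias] = bias;      // bias vector (primitive was created with bias)
  res[dnnResourceDst] = out;        // output activations
  CHECK_EQ(dnnExecute<float>(fwd, res), E_SUCCESS);
}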
diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h
deleted file mode 100644
index 90dfad50fa62..000000000000
--- a/src/operator/mkl/mkl_lrn-inl.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_lrn-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template <typename xpu, typename DType>
-class MKLLRNOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLLRNOp";
-  }
-
-  explicit MKLLRNOp(LRNParam param) :
-    lrnFwd(static_cast<dnnPrimitive_t>(NULL)),
-    lrnBwd(static_cast<dnnPrimitive_t>(NULL)),
-    lrn_buffer_(NULL) {
-    this->param_ = param;
-    fwd_top_data_ = MKLData<DType>::create();
-    fwd_bottom_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-    bwd_bottom_diff_ = MKLData<DType>::create();
-    init_mkldnn_ = false;
-  }
-
-  virtual ~MKLLRNOp() {
-    if (lrnFwd != NULL) {
-      dnnDelete<DType>(lrnFwd);
-      lrnFwd = NULL;
-    }
-    if (lrnBwd != NULL) {
-      dnnDelete<DType>(lrnBwd);
-      lrnBwd = NULL;
-    }
-    dnnReleaseBuffer<DType>(lrn_buffer_);
-  }
-
- private:
-  void LayerSetup(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    size_ = param_.nsize;
-    CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size";
-
-    alpha_ = param_.alpha;
-    beta_ = param_.beta;
-    k_ = param_.knorm;
-    size_t dim = 4, sizes[4], strides[4];
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-    sizes[0] = width_;
-    sizes[1] = height_;
-    sizes[2] = channels_;
-    sizes[3] = num_;
-
-    strides[0] = 1;
-    strides[1] = sizes[0];
-    strides[2] = sizes[0] * sizes[1];
-    strides[3] = sizes[0] * sizes[1] * sizes[2];
-
-    fwd_bottom_data_->name = "fwd_bottom_data_ @ " + getName();
-    fwd_top_data_->name = "fwd_top_data_ @ " + getName();
-    bwd_top_diff_->name = "bwd_top_diff_ @ " + getName();
-    bwd_bottom_diff_->name = "bwd_bottom_diff_ @ " + getName();
-
-    fwd_bottom_data_->create_user_layout(dim, sizes, strides);
-    fwd_top_data_->create_user_layout(dim, sizes, strides);
-    bwd_bottom_diff_->create_user_layout(dim, sizes, strides);
-    bwd_top_diff_->create_user_layout(dim, sizes, strides);
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[lrn_enum::kData], s);
-    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_data[lrn_enum::kOut], s);
-    if (!init_mkldnn_) {
-      LayerSetup(data, out);
-      init_mkldnn_ = true;
-    }
-
-    const void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-      reinterpret_cast<const void*>(mkl_prv_data<DType>(in_data[lrn_enum::kData]));
-#endif
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-      if (lrnFwd == NULL) {
-        std::shared_ptr<MKLMemHolder> bottom_data_mem =
-          in_data[lrn_enum::kData].Mkl_mem_;
-        std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
-          bottom_data_mem->get_prv_descriptor();
-        CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
-                 PrvMemDescr::PRV_DESCR_MKL2017);
-        std::shared_ptr<MKLData<DType> > mem_descr
-          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
-        CHECK(mem_descr != nullptr);
-        fwd_bottom_data_ = mem_descr;
-
-        dnnError_t e;
-        dnnLayout_t lrn_buffer_l = NULL;
-
-        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_int,
-          size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst);
-
-        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
-          fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int,
-          size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnLayoutCreateFromPrimitive<DType>(
-          &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnAllocateBuffer<DType>(
-          reinterpret_cast<void**>(&lrn_buffer_), lrn_buffer_l);
-        CHECK_EQ(e, E_SUCCESS);
-        dnnLayoutDelete<DType>(lrn_buffer_l);
-
-        bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst);
-        bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc);
-      }
-    }
-#endif
-    if (bottom_data == NULL) {
-      if (lrnFwd == NULL) {
-        dnnError_t e;
-        dnnLayout_t lrn_buffer_l = NULL;
-        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_usr,
-          size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnLayoutCreateFromPrimitive<DType>(
-          &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnAllocateBuffer<DType>(
-          reinterpret_cast<void**>(&lrn_buffer_), lrn_buffer_l);
-        CHECK_EQ(e, E_SUCCESS);
-        dnnLayoutDelete<DType>(lrn_buffer_l);
-
-        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
-          fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
-          size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-      bottom_data = data.dptr_;
-    }
-
-    dnnError_t e;
-    void* lrn_res[dnnResourceNumber];
-    lrn_res[dnnResourceSrc] = const_cast<void*>(bottom_data);
-
-    lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
-      out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]);
-    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
-    e = dnnExecute<DType>(lrnFwd, lrn_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 2);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_grad[lrn_enum::kOut], s);
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[lrn_enum::kData], s);
-    Tensor<xpu, 4, DType> grad_in = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_grad[lrn_enum::kData], s);
-    dnnError_t e;
-    void* lrn_res[dnnResourceNumber];
-    lrn_res[dnnResourceDiffDst] =
-      bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]);
-    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
-    lrn_res[dnnResourceSrc] =
-      fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]);
-
-    lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
-      grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]);
-    e = dnnExecute<DType>(lrnBwd, lrn_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
- private:
-  LRNParam param_;
-  int size_;
-  int pre_pad_;
-  DType alpha_;
-  DType beta_;
-  DType k_;
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-  bool init_mkldnn_;
-
- private:
-  dnnPrimitive_t lrnFwd, lrnBwd;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
-
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
-
-  DType *lrn_buffer_;
-};  // class LocalResponseNormOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-
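One detail worth noting in the operator above: LRN needs a scratch workspace whose layout is dictated by the primitive itself, which is why the code asks the forward primitive for its dnnResourceWorkspace layout before allocating. A sketch of just that step (hypothetical helper; lrnFwd is assumed to have been created with dnnLRNCreateForward<float>):

float* AllocateLrnWorkspace(dnnPrimitive_t lrnFwd) {
  dnnLayout_t ws_layout = NULL;
  CHECK_EQ(dnnLayoutCreateFromPrimitive<float>(
      &ws_layout, lrnFwd, dnnResourceWorkspace), E_SUCCESS);
  float* workspace = NULL;
  CHECK_EQ(dnnAllocateBuffer<float>(
      reinterpret_cast<void**>(&workspace), ws_layout), E_SUCCESS);
  dnnLayoutDelete<float>(ws_layout);  // the layout only described the buffer
  return workspace;  // caller releases it with dnnReleaseBuffer<float>
}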
diff --git a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h
deleted file mode 100644
index 71af10254b2a..000000000000
--- a/src/operator/mkl/mkl_memory-inl.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
-
-
-#include
-#include
-#include
-#include "mkl_cppwrapper.h"
-
-namespace mxnet {
-
-template <typename DType>
-struct MKLMemoryDescriptorBase : public PrvMemDescr,
-    public std::enable_shared_from_this<MKLMemoryDescriptorBase<DType> > {
-  MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL),
-    convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL),
-    name("UNKNOWN"), internal_ptr(NULL) {}
-  virtual ~MKLMemoryDescriptorBase() {
-    dnnLayoutDelete<DType>(layout_usr);
-    dnnLayoutDelete<DType>(layout_int);
-    if (internal_ptr != NULL) {
-      dnnReleaseBuffer<DType>(internal_ptr);
-      internal_ptr = NULL;
-    }
-    if (convert_to_int != NULL) {
-      dnnDelete<DType>(convert_to_int);
-      convert_to_int = NULL;
-    }
-    if (convert_from_int != NULL) {
-      dnnDelete<DType>(convert_from_int);
-      convert_from_int = NULL;
-    }
-    if (convert_prv2prv != NULL) {
-      dnnDelete<DType>(convert_prv2prv);
-      convert_prv2prv = NULL;
-    }
-  }
-  std::shared_ptr<MKLMemoryDescriptorBase<DType> > get_shared_ptr() {
-    return this->shared_from_this();
-  }
-
-  dnnLayout_t layout_usr;
-  dnnLayout_t layout_int;
-  dnnPrimitive_t convert_to_int;
-  dnnPrimitive_t convert_from_int;
-  dnnPrimitive_t convert_prv2prv;
-  std::shared_ptr<MKLMemoryDescriptorBase<DType> > descr_prv2prv_conversion;
-
-
-  std::string name;  // for debugging purposes
-  void allocate() {
-    if (internal_ptr == NULL) {
-      int status = dnnAllocateBuffer<DType>(
-        reinterpret_cast<void**>(&internal_ptr), layout_int);
-      CHECK_EQ(status, E_SUCCESS)
-        << "Failed internal_ptr memory allocation with status "
-        << status << "\n";
-    }
-  }
-  virtual void* prv_ptr(bool allocate_when_uninit = true) {
-    if (internal_ptr == NULL && allocate_when_uninit)
-      allocate();
-    return internal_ptr;
-  }
-  inline bool conversion_needed() {
-    return (convert_to_int != NULL);
-  }
-  void create_conversions();
-  void create_internal_layout(const dnnPrimitive_t primitive,
-                              dnnResourceType_t type);
-  void create_user_layout(size_t dimension, const size_t size[],
-                          const size_t strides[]);
-  void create_layouts(
-    const dnnPrimitive_t primitive, dnnResourceType_t type,
-    size_t dimension, const size_t size[], const size_t strides[]);
-
-  virtual PrvDescrType get_descr_type() {
-    return PRV_DESCR_MKL2017;
-  }
-  virtual size_t prv_size() {
-    return dnnLayoutGetMemorySize<DType>(layout_int);
-  }
-  virtual size_t prv_count() {
-    return dnnLayoutGetMemorySize<DType>(layout_int) / sizeof(DType);
-  }
-  virtual void convert_from_prv(void* cpu_ptr);
-  virtual void convert_to_prv(void* cpu_ptr);
-  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other);
-  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other);
- protected:
-  DType* internal_ptr;
-};
-
-template <typename DType>
-struct MKLMemoryDescriptor : MKLMemoryDescriptorBase<DType> {
-  // The last get_converted_prv() argument is a hack for reusing
-  // in backward a conversion done already in the forward direction.
-  DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr,
-                           const TBlob &blob);
-  void* get_output_ptr(DType *data_ptr, std::shared_ptr<MKLMemoryDescriptor<DType> > self_ptr,
-                       const TBlob &blob, bool in_place = false);
-  bool copy_from(std::shared_ptr<MKLMemHolder> dnn_chunk);
-  MKLMemoryDescriptor() {}
-};
-
-template <typename DType> struct MKLData : MKLMemoryDescriptor<DType> {
-  static std::shared_ptr<MKLData<DType> > create() {
-    return std::make_shared<MKLData<DType> >();
-  }
-};
-
-template struct MKLData<float>;
-template struct MKLData<double>;
-
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
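The descriptor above pairs a user-visible layout with an MKL-internal one and converts only when the two differ; conversion_needed() is simply "a conversion primitive was created". A sketch of that decision, mirroring create_conversions() (hypothetical free function for clarity; the real code is a member and also rebuilds stale primitives):

bool CreateConversionsIfNeeded(dnnLayout_t layout_usr, dnnLayout_t layout_int,
                               dnnPrimitive_t* to_int, dnnPrimitive_t* from_int) {
  // dnnLayoutCompare returns non-zero when the layouts are identical.
  if (layout_int == NULL || dnnLayoutCompare<float>(layout_usr, layout_int)) {
    return false;  // identical layouts (or no internal layout): no conversions
  }
  CHECK_EQ(dnnConversionCreate<float>(to_int, layout_usr, layout_int), E_SUCCESS);
  CHECK_EQ(dnnConversionCreate<float>(from_int, layout_int, layout_usr), E_SUCCESS);
  return true;  // conversion_needed() will now report true
}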
size[], const size_t strides[]) { - int status; - if (this->layout_usr) { - status = dnnLayoutDelete(this->layout_usr); - CHECK_EQ(status, E_SUCCESS); - } - - status = dnnLayoutCreate( - &this->layout_usr, dimension, size, strides); - CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status " - << status << " for buffer: " << this->name << "\n"; - - if (this->layout_int) - this->create_conversions(); -} - -template -void MKLMemoryDescriptorBase::create_layouts( - const dnnPrimitive_t primitive, dnnResourceType_t type, - size_t dimension, const size_t size[], const size_t strides[]) { - this->create_internal_layout(primitive, type); - this->create_user_layout(dimension, size, strides); -} - - -template -void MKLMemoryDescriptorBase::convert_from_prv(void* cpu_ptr) { - CHECK(cpu_ptr); - CHECK(this->convert_from_int); - int status; - void *convert_resources[dnnResourceNumber]; - - convert_resources[dnnResourceFrom] = this->prv_ptr(); - convert_resources[dnnResourceTo] = cpu_ptr; - status = dnnExecute(this->convert_from_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; -} - -template -void MKLMemoryDescriptorBase::convert_to_prv(void* cpu_ptr) { - CHECK(cpu_ptr); - CHECK(this->convert_to_int); - int status; - void *convert_resources[dnnResourceNumber]; - - convert_resources[dnnResourceFrom] = cpu_ptr; - convert_resources[dnnResourceTo] = this->prv_ptr(); - status = dnnExecute(this->convert_to_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; -} - - -template -bool MKLMemoryDescriptorBase::layout_compare( - std::shared_ptr other) { - CHECK_EQ(other->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr >other_descr = - std::static_pointer_cast > - (other); - - if (dnnLayoutCompare(other_descr->layout_int, - this->layout_int)) - return true; - else - return false; -} - -template -void MKLMemoryDescriptorBase::convert_from_other( - std::shared_ptr other) { - std::shared_ptr > other_descr = - std::static_pointer_cast > - (other); - - int status; - dnnPrimitive_t convert; - status = dnnConversionCreate(&convert, - other_descr->layout_int, this->layout_int); - - void *convert_resources[dnnResourceNumber]; - convert_resources[dnnResourceFrom] = other_descr->prv_ptr(); - convert_resources[dnnResourceTo] = this->prv_ptr(); - status = dnnExecute(convert, convert_resources); - CHECK_EQ(status, 0) << "Conversion from other failed with status " - << status; - - dnnDelete(convert); -} - - -template -Dtype* MKLMemoryDescriptor::get_converted_prv( - Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) { - Dtype* prv_ptr = NULL; - std::shared_ptr dnn_chunk = NULL; -#if MKL_EXPERIMENTAL == 1 - dnn_chunk = blob.Mkl_mem_; -#endif -#if MKL_EXPERIMENTAL == 1 - if (dnn_chunk != NULL) - prv_ptr = static_cast(dnn_chunk->prv_data()); -#endif - - if (this->convert_to_int != NULL) { -#if MKL_EXPERIMENTAL == 1 - int status; - void *convert_resources[dnnResourceNumber]; -#endif - if (prv_ptr == NULL) { - this->allocate(); - this->convert_to_prv(cpu_ptr); -#if MKL_EXPERIMENTAL == 1 - if (set_prv_ptr) { - dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); - } -#endif - return this->internal_ptr; - } -#if MKL_EXPERIMENTAL == 1 - if (prv_ptr != NULL) { - std::shared_ptr > current_descr = - op::mkl_get_mem_desc(dnn_chunk); - if (!dnnLayoutCompare(current_descr->layout_int, - this->layout_int)) { - if (this->convert_prv2prv) { - CHECK_EQ(dnnLayoutCompare( - 
this->descr_prv2prv_conversion->layout_int, - this->layout_int), 0); - status = 0; - } else { - status = dnnConversionCreate(&this->convert_prv2prv, - current_descr->layout_int, this->layout_int); - if (status == 0) - this->descr_prv2prv_conversion = current_descr; - } - if (status != 0) { - this->allocate(); - convert_resources[dnnResourceFrom] = cpu_ptr; - convert_resources[dnnResourceTo] = - reinterpret_cast(this->internal_ptr); - status = dnnExecute(this->convert_to_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion failed with status " << status; - } else { - this->allocate(); - convert_resources[dnnResourceFrom] = reinterpret_cast(prv_ptr); - convert_resources[dnnResourceTo] = - reinterpret_cast(this->internal_ptr); - status = dnnExecute(this->convert_prv2prv, convert_resources); - CHECK_EQ(status, 0) << "Conversion failed with status " << status; - } - if (set_prv_ptr) { - dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); - } - return this->internal_ptr; - } else if (current_descr.get() != this) { - // MKL_DLOG(INFO) << "layout OK " - // << current_descr->name << " == " << this->name; - } - } -#endif - return const_cast(prv_ptr); - } else { - if (prv_ptr != NULL) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr > other_descr = - std::static_pointer_cast > - (dnn_chunk->prv_descriptor_); - dnn_chunk->check_and_prv_to_cpu(cpu_ptr); -#endif - // printf("get_converted_prv release %s\n", other_descr->name.c_str()); - } - } - return cpu_ptr; -} - -template -void* MKLMemoryDescriptor::get_output_ptr(Dtype *data_ptr, - std::shared_ptr > self_ptr, const TBlob &blob, bool in_place) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr dnn_chunk = blob.Mkl_mem_; -#endif - if (this->conversion_needed()) { - void * prv_ptr = this->prv_ptr(); -#if MKL_EXPERIMENTAL == 1 - if (!in_place) { - dnn_chunk->set_prv_descriptor(self_ptr); - } else { - Dtype * blob_prv = op::mkl_prv_data(blob); - if (blob_prv != NULL) - return blob_prv; - } -#endif - return prv_ptr; - } else { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr > other_descr = - std::static_pointer_cast > - (dnn_chunk->prv_descriptor_); - dnn_chunk->check_and_prv_to_cpu(data_ptr); -#endif - return data_ptr; - } -} - -template class MKLMemoryDescriptor; -template class MKLMemoryDescriptor; - -template class MKLMemoryDescriptorBase; -template class MKLMemoryDescriptorBase; -} // namespace mxnet -#endif diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h deleted file mode 100644 index 13f1fd27b12b..000000000000 --- a/src/operator/mkl/mkl_memory.h +++ /dev/null @@ -1,123 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
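The branching in get_converted_prv() above condenses to the sketch below; the prv-to-prv reorder path and the MKL_EXPERIMENTAL bookkeeping are elided, and the template argument lists (lost in this copy of the diff) are written back in as inferred:

template <typename DType>
DType* ConvertedPtr(MKLMemoryDescriptorBase<DType>* d, DType* cpu_ptr,
                    DType* existing_prv) {  // existing_prv may be nullptr
  if (!d->conversion_needed())
    return cpu_ptr;                   // user layout already matches: no copy
  if (existing_prv == nullptr) {
    d->allocate();                    // lazily create the internal buffer
    d->convert_to_prv(cpu_ptr);       // reorder cpu -> internal layout
    return static_cast<DType*>(d->prv_ptr());
  }
  return existing_prv;                // a matching private copy already exists
}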
-* -* \file mkl_memory.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_ -#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_ - -#include -#include -#include - - -namespace mxnet { -// Base class -struct PrvMemDescr { - virtual void convert_from_prv(void* cpu_ptr) = 0; - virtual void convert_to_prv(void* cpu_ptr) = 0; - virtual void convert_from_other(std::shared_ptr other) = 0; - virtual void* prv_ptr(bool allocate_when_uninit = true) = 0; - // returns true for matching layouts - virtual bool layout_compare(std::shared_ptr other) = 0; - virtual size_t prv_count() = 0; - virtual size_t prv_size() = 0; - // This might help using prv_ptr_ by different accelerators/engines - enum PrvDescrType { - PRV_DESCR_MKL2017, - PRV_DESCR_MKLDNN - }; - virtual PrvDescrType get_descr_type() = 0; -}; - -#if MKL_EXPERIMENTAL == 1 -// Currently HEAD_AT_PRV do not free CPU data -enum SyncedHead { - HEAD_AT_CPU, - HEAD_AT_PRV, -}; -struct MKLMemHolder { - SyncedHead head_; - std::shared_ptr prv_descriptor_; - bool b_disable_prv_2_cpu; - bool b_eager_mode; - void disable_prv_2_cpu(bool flag) { - b_disable_prv_2_cpu = flag; - } - void set_eager_mode(bool eager_mode) { - b_eager_mode = eager_mode; - } - void set_prv_descriptor(std::shared_ptr descriptor, bool same_data = false) { - head_ = HEAD_AT_PRV; - prv_descriptor_ = descriptor; - } - std::shared_ptr get_prv_descriptor() { - return prv_descriptor_; - } - bool head_at_prv() { - return (head_ == HEAD_AT_PRV) ? true : false; - } - void* prv_data(bool allocate_when_uninit = true) { - if (head_ != HEAD_AT_PRV) { - return NULL; - } - if (prv_descriptor_ == NULL) { - LOG(FATAL) << " prv_descriptor_ is NULL"; - } - CHECK(prv_descriptor_.get()); - return reinterpret_cast(prv_descriptor_->prv_ptr(allocate_when_uninit)); - } - - int prv_count() { - if (head_ != HEAD_AT_PRV) { - return 0; - } - if (prv_descriptor_ == NULL) { - LOG(FATAL) << " prv_descriptor_ is NULL"; - } - CHECK(prv_descriptor_.get()); - return prv_descriptor_->prv_count(); - } - static std::shared_ptr create() { - return std::make_shared(); - } - void check_and_prv_to_cpu(void *dptr_) { - if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) { - CHECK(prv_descriptor_ != nullptr); - prv_descriptor_->convert_from_prv(dptr_); - // Because operator use CPU & maybe change it, change to CPU Flag - head_ = HEAD_AT_CPU; - } - if (b_disable_prv_2_cpu) { - b_disable_prv_2_cpu = false; - } - } - MKLMemHolder() : - head_(HEAD_AT_CPU), prv_descriptor_(nullptr), - b_disable_prv_2_cpu(false), b_eager_mode(false) {} -}; -#else -struct MKLMemHolder { - public: - virtual std::shared_ptr get_prv_descriptor() = 0; -}; -#endif - -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_H_ diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h deleted file mode 100644 index 5662a61aebd3..000000000000 --- a/src/operator/mkl/mkl_pooling-inl.h +++ /dev/null @@ -1,357 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
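A sketch of how operator code is expected to drive the MKLMemHolder state machine defined above: before a plain CPU kernel reads dptr_, any private (MKL-layout) copy is reordered back into the CPU buffer, which flips head_ to HEAD_AT_CPU. The helper name is illustrative:

void EnsureCPUHead(const std::shared_ptr<mxnet::MKLMemHolder>& holder,
                   void* dptr) {
  if (holder != nullptr) {
    // No-op when head_ is already HEAD_AT_CPU or prv->cpu sync is disabled.
    holder->check_and_prv_to_cpu(dptr);
  }
}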
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_pooling-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ - -#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ -#include -#include -#include -#include "../operator_common.h" -#include "../nn/pooling-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - - -template -class MKLPoolingOp : public Operator { - public: - static std::string getName() { - return "MKLPoolingOp"; - } - explicit MKLPoolingOp(PoolingParam p) { - poolingFwd = static_cast(NULL); - poolingBwd = static_cast(NULL); - max_idx_data = static_cast(NULL); - fwd_top_data = MKLData::create(); - fwd_bottom_data = MKLData::create(); - bwd_top_diff = MKLData::create(); - bwd_bottom_diff = MKLData::create(); - this->param_ = p; - init_mkldnn_ = false; - } - virtual ~MKLPoolingOp() { - if (poolingFwd != NULL) { - dnnDelete(poolingFwd); - poolingFwd = NULL; - } - if (poolingBwd != NULL) { - dnnDelete(poolingBwd); - poolingBwd = NULL; - } - if (max_idx_data != NULL) { - dnnReleaseBuffer(max_idx_data); - max_idx_data = NULL; - } - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - global_pooling_ = param_.global_pool; - if (global_pooling_) { - kernel_h_ = height_; - kernel_w_ = width_; - } else { - kernel_h_ = param_.kernel[0]; - kernel_w_ = param_.kernel[1]; - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - pad_h_ = param_.pad[0]; - pad_w_ = param_.pad[1]; - if (global_pooling_) { - stride_h_ = stride_w_ = 1; - } else { - stride_h_ = param_.stride[0]; - stride_w_ = param_.stride[1]; - } - if (global_pooling_) { - CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(param_.pool_type == pool_enum::kAvgPooling - || param_.pool_type == pool_enum::kMaxPooling) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_h_, kernel_h_); - CHECK_LT(pad_w_, kernel_w_); - } - pooled_height_ = out.shape_[2]; - pooled_width_ = out.shape_[3]; - - size_t dim = 4; - size_t src_sizes[4], src_strides[4]; - size_t dst_sizes[4], dst_strides[4]; - src_sizes[0] = width_; - src_sizes[1] = height_; - src_sizes[2] = channels_; - src_sizes[3] = num_; - src_strides[0] = 1; - src_strides[1] = src_sizes[0]; - src_strides[2] = src_sizes[0] * src_sizes[1]; - src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2]; - dst_sizes[0] = pooled_width_; - dst_sizes[1] = pooled_height_; - dst_sizes[2] = src_sizes[2]; - dst_sizes[3] = src_sizes[3]; - dst_strides[0] = 1; - dst_strides[1] = dst_sizes[0]; - dst_strides[2] = dst_sizes[0] * dst_sizes[1]; - dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2]; - src_offset[0] = -pad_w_; - src_offset[1] = -pad_h_; - src_offset[2] = -pad_w_; - 
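The size/stride bookkeeping in LayerSetUp follows one rule worth spelling out: MKL2017 user layouts list dimensions fastest-varying first (w, h, c, n for NCHW data), and each stride is the product of all faster-varying sizes. As a standalone sketch:

inline void ComputeStrides(size_t dim, const size_t sizes[], size_t strides[]) {
  strides[0] = 1;
  for (size_t d = 1; d < dim; ++d)
    strides[d] = strides[d - 1] * sizes[d - 1];
}
// e.g. sizes = {W, H, C, N}  ->  strides = {1, W, W*H, W*H*C}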
src_offset[3] = -pad_h_; - kernel_stride[0] = stride_w_; - kernel_stride[1] = stride_h_; - kernel_size[0] = kernel_w_; - kernel_size[1] = kernel_h_; - - // Names are for debugging only - fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); - fwd_top_data->name = "fwd_top_data @ " + getName(); - bwd_top_diff->name = "bwd_top_diff @ " + getName(); - bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); - - fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides); - fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides); - bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides); - bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides); - - // Primitives will be allocated during the first fwd pass - poolingFwd = NULL; - poolingBwd = NULL; - max_idx_data = NULL; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - if (param_.kernel.ndim() >= 3) { - LOG(FATAL) << "Not implmented"; - } - Tensor data = mkl_experimental_direct_get( - in_data[pool_enum::kData], s); - Tensor out = mkl_experimental_direct_get( - out_data[pool_enum::kOut], s); - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - auto first_pass = false; - if (poolingFwd == NULL) first_pass = true; - - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax; - - switch (param_.pool_type) { - case pool_enum::kMaxPooling: - algorithm = dnnAlgorithmPoolingMax; - break; - case pool_enum::kAvgPooling: - algorithm = dnnAlgorithmPoolingAvgIncludePadding; - - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - - dnnError_t status; - void* pooling_res[dnnResourceNumber]; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[pool_enum::kData])); -#endif - dnnBorder_t border_type = dnnBorderZerosAsymm; - switch (param_.pooling_convention) { - case pool_enum::kFull: - border_type = dnnBorderZeros; - break; - case pool_enum::kValid: - border_type = dnnBorderZerosAsymm; - break; - default: - border_type = dnnBorderZerosAsymm; - break; - } - if (NULL == bottom_data) { - bottom_data = data.dptr_; - if (NULL == poolingFwd) { - status = dnnPoolingCreateForward(&poolingFwd, NULL, - algorithm, fwd_bottom_data->layout_usr, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - // Now create poolingBwd - status = dnnPoolingCreateBackward(&poolingBwd, NULL, - algorithm, fwd_bottom_data->layout_usr, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - } - } -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - if (NULL == poolingFwd) { - std::shared_ptr bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_; - std::shared_ptr bottom_prv_descriptor = - bottom_data_mem->get_prv_descriptor(); - CHECK_EQ(bottom_prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_descriptor); - CHECK(mem_descr != nullptr); - fwd_bottom_data = mem_descr; - - status = dnnPoolingCreateForward(&poolingFwd, NULL, - algorithm, fwd_bottom_data->layout_int, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); - - // 
Now create poolingBwd - status = dnnPoolingCreateBackward(&poolingBwd, NULL, - algorithm, fwd_bottom_data->layout_int, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst); - bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc); - } - } -#endif - - if (first_pass) { - dnnLayout_t max_idx_datal = NULL; - status = dnnLayoutCreateFromPrimitive( - &max_idx_datal, poolingFwd, dnnResourceWorkspace); - CHECK_EQ(status, E_SUCCESS); - status = dnnAllocateBuffer(reinterpret_cast(&max_idx_data), max_idx_datal); - CHECK_EQ(status, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc); - fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); - bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst); - bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc); -#endif - dnnLayoutDelete(max_idx_datal); - first_pass = false; - } - pooling_res[dnnResourceSrc] = bottom_data; - pooling_res[dnnResourceWorkspace] = max_idx_data; - - pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr( - out.dptr_, fwd_top_data, out_data[pool_enum::kOut]); - status = dnnExecute(poolingFwd, pooling_res); - CHECK_EQ(status, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out.dptr_); - } -#endif - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - if (!req[0]) { - return; - } - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - CHECK_EQ(req.size(), 1); - CHECK_EQ(in_grad.size(), 1); - if (param_.kernel.ndim() >= 3) { - LOG(FATAL) << "Not implmented"; - } - Stream *s = ctx.get_stream(); - Tensor grad = mkl_experimental_direct_get( - out_grad[pool_enum::kOut], s); - Tensor input_grad = mkl_experimental_direct_get( - in_grad[pool_enum::kData], s); - dnnError_t e; - void* pooling_res[dnnResourceNumber]; - pooling_res[dnnResourceWorkspace] = reinterpret_cast(max_idx_data); - - pooling_res[dnnResourceDiffDst] = - bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]); - - pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr( - input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]); - e = dnnExecute(poolingBwd, pooling_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff->conversion_needed()) { - bwd_bottom_diff->convert_from_prv(input_grad.dptr_); - } -#endif - } - - private: - PoolingParam param_; - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_, num_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - - private: - size_t kernel_size[2], - kernel_stride[4]; - int src_offset[4]; // 2*(dimension-2) - dnnPrimitive_t poolingFwd, poolingBwd; - DType *max_idx_data; - - std::shared_ptr > fwd_top_data; - std::shared_ptr > fwd_bottom_data; - std::shared_ptr > bwd_top_diff; - std::shared_ptr > bwd_bottom_diff; - bool init_mkldnn_; -}; // class MKLPoolingOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h deleted file 
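Both pooling passes above use the same execution model: fill a slot array indexed by dnnResourceType_t, then hand the primitive and its slots to dnnExecute. Reduced to a sketch (float data and the wrappers from mkl_cppwrapper.h assumed; the helper name is illustrative):

void RunPrimitive(dnnPrimitive_t prim, void* src, void* dst, void* workspace) {
  void* res[dnnResourceNumber] = {};
  res[dnnResourceSrc] = src;              // input, user or private layout
  res[dnnResourceDst] = dst;              // output
  res[dnnResourceWorkspace] = workspace;  // e.g. max-pooling argmax indices
  CHECK_EQ(dnnExecute<float>(prim, res), E_SUCCESS);
}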
mode 100644 index 8d7ab5e1e2db..000000000000 --- a/src/operator/mkl/mkl_relu-inl.h +++ /dev/null @@ -1,272 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_relu-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLReluOp : public Operator { - public: - static std::string getName() { - return "MKLReluOp"; - } - MKLReluOp(): - reluFwd_(NULL), - reluBwd_(NULL) { - init_mkldnn_ = false; - fwd_top_data_ = MKLData::create(); - fwd_bottom_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - bwd_bottom_diff_ = MKLData::create(); - } - - ~MKLReluOp() { - if (reluFwd_ != NULL) { - dnnDelete(reluFwd_); - reluFwd_ = NULL; - } - if (reluBwd_ != NULL) { - dnnDelete(reluBwd_); - reluBwd_ = NULL; - } - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - size_t dim = 4; - size_t *sizes = new size_t[dim]; - size_t *strides = new size_t[dim]; - for (size_t d = 0; d < dim; ++d) { - (sizes)[d] = data.shape_[dim - 1 - d]; - (strides)[d] = (d == 0) ? 
1 : (strides)[d - 1] * (sizes)[d - 1]; - } - // Names are for debugging only - fwd_bottom_data_->name = "fwd_bottom_data @ " + getName(); - fwd_top_data_->name = "fwd_top_data @ " + getName(); - bwd_bottom_diff_->name = "bwd_bottom_diff @ " + getName(); - bwd_top_diff_->name = "bwd_top_diff @ " + getName(); - fwd_bottom_data_->create_user_layout(dim, (sizes), (strides)); - fwd_top_data_->create_user_layout(dim, (sizes), (strides)); - bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides)); - bwd_top_diff_->create_user_layout(dim, (sizes), (strides)); - delete[] sizes; - delete[] strides; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - Tensor data; - Tensor out; - if (in_data[activation::kData].ndim() == 1) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else if (in_data[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else if (in_data[activation::kData].ndim() == 3) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], - in_data[activation::kData].shape_[2], 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[activation::kData], s); - out = mkl_experimental_direct_get(out_data[activation::kOut], s); - } - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[activation::kData])); -#endif -#if MKL_EXPERIMENTAL == 1 - if (bottom_data != NULL) { - if (reluFwd_ == NULL) { - std::shared_ptr > mem_descr = - mkl_get_mem_desc(in_data[activation::kData].Mkl_mem_); - DType negative_slope = 0; - dnnError_t e; - e = dnnReLUCreateForward(&reluFwd_, NULL, mem_descr->layout_int, - negative_slope); - CHECK_EQ(e, E_SUCCESS); - e = dnnReLUCreateBackward(&reluBwd_, NULL, mem_descr->layout_int, - mem_descr->layout_int, negative_slope); - CHECK_EQ(e, E_SUCCESS); - - fwd_bottom_data_ = mem_descr; - fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst); - bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst); - bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc); - } - } -#endif - if (bottom_data == NULL) { - bottom_data = data.dptr_; - if (reluFwd_ == NULL) { - dnnError_t e; - DType negative_slope = 0; - e = dnnReLUCreateForward(&reluFwd_, NULL, - fwd_bottom_data_->layout_usr, negative_slope); - CHECK_EQ(e, E_SUCCESS); - e = dnnReLUCreateBackward(&reluBwd_, NULL, - fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, - negative_slope); - CHECK_EQ(e, E_SUCCESS); - } - } - dnnError_t e; - void* relu_res[dnnResourceNumber]; - 
relu_res[dnnResourceSrc] = bottom_data; - - relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( - out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_)); - e = dnnExecute(reluFwd_, relu_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data_->conversion_needed()) { - fwd_top_data_->convert_from_prv(out.dptr_); - } -#endif - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - if (!req[0]) { - return; - } - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1); - Stream *s = ctx.get_stream(); - Tensor m_out_grad; - Tensor m_out_data; - Tensor m_in_grad; - - if (out_grad[activation::kOut].ndim() == 1) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else if (out_grad[activation::kOut].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], - out_grad[activation::kOut].shape_[1], 1, 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else if (out_grad[activation::kOut].ndim() == 3) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], - out_grad[activation::kOut].shape_[1], - out_grad[activation::kOut].shape_[2], 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else { - m_out_grad = mkl_experimental_direct_get(out_grad[activation::kOut], s); - m_out_data = mkl_experimental_direct_get(out_data[activation::kOut], s); - m_in_grad = mkl_experimental_direct_get(in_grad[activation::kData], s); - } - dnnError_t e; - void* relu_res[dnnResourceNumber]; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = reinterpret_cast(mkl_prv_data(out_data[activation::kOut])); -#endif - if (NULL == bottom_data) { - bottom_data = reinterpret_cast(const_cast(m_out_data.dptr_)); - } - relu_res[dnnResourceSrc] = bottom_data; - relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_, - true, out_grad[activation::kOut]); - relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( - m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]); - e = dnnExecute(reluBwd_, relu_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff_->conversion_needed()) { - bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_); - } -#endif - } - - private: - bool init_mkldnn_; - std::shared_ptr > fwd_top_data_; - std::shared_ptr > fwd_bottom_data_; - std::shared_ptr > bwd_top_diff_; - std::shared_ptr > bwd_bottom_diff_; - dnnPrimitive_t reluFwd_, reluBwd_; -}; // class MKLReluOp -} // 
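Forward and Backward above repeat one pattern three times: MKL2017 primitives want 4-D tensors, so lower-rank blobs are viewed as 4-D by padding the trailing dimensions with 1, a zero-copy reshape. The repeated branches amount to this sketch:

inline mshadow::Shape<4> PadTo4D(const mxnet::TShape& s) {
  CHECK_LE(s.ndim(), 4);
  mshadow::Shape<4> out = mshadow::Shape4(1, 1, 1, 1);
  for (uint32_t i = 0; i < s.ndim(); ++i)
    out[i] = s[i];
  return out;
}
// (N,) -> (N,1,1,1);   (N,C) -> (N,C,1,1);   (N,C,H) -> (N,C,H,1)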
namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h deleted file mode 100644 index 4ad786a2ce93..000000000000 --- a/src/operator/mkl/mkl_util-inl.h +++ /dev/null @@ -1,110 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_util-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ -#include -#define MKLDNN_CALL(func) \ - { \ - dnnError_t status = (func); \ - CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ")."; \ - } - - -namespace mxnet { -namespace op { - -#if MKL_EXPERIMENTAL == 1 - template - inline DType * mkl_prv_data(const TBlob &b) { - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - return reinterpret_cast(bottom_data_mem->prv_data()); - } - return NULL; - } - - template - inline int mkl_prv_count(const TBlob &b) { - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - return bottom_data_mem->prv_count(); - } - return 0; - } -#endif - inline void mkl_set_priv_flag(const TBlob &b) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - bottom_data_mem->disable_prv_2_cpu(true); - } -#endif - } -#if MKL_EXPERIMENTAL == 1 - template - inline std::shared_ptr > mkl_get_mem_desc( - const std::shared_ptr data_mem) { - std::shared_ptr prv_descriptor = - data_mem->get_prv_descriptor(); - CHECK_EQ(prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast> - (prv_descriptor); - CHECK(mem_descr != NULL); - return mem_descr; - } -#endif - template - inline mshadow::Tensor mkl_experimental_direct_get( - const TBlob &b, mshadow::Stream *s) { - mkl_set_priv_flag(b); - return b.get(s); - } - template - inline mshadow::Tensor mkl_experimental_direct_get_with_shape( - const TBlob &b, const mshadow::Shape &shape, mshadow::Stream *s) { - mkl_set_priv_flag(b); - return b.get_with_shape(shape, s); - } -} // namespace op -#if MKL_EXPERIMENTAL == 1 -inline void mkl_tblobs_prv_to_cpu(const std::vector &data) { - for (size_t i = 0; i < data.size(); i++) { - std::shared_ptr mem_holder = data[i].Mkl_mem_; - if (mem_holder != nullptr && mem_holder->b_eager_mode) { - mem_holder->check_and_prv_to_cpu(data[i].dptr_); - } - } -} -inline void mkl_set_tblob_eager_mode(const TBlob &data) { - std::shared_ptr mem_holder = data.Mkl_mem_; - if (mem_holder != nullptr) { - 
mem_holder->set_eager_mode(true); - } -} -#endif -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ From caa3bf374ce3f455c0705865953bbcd6877cde7b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 20 Oct 2017 17:29:15 -0700 Subject: [PATCH 042/264] Update MXNet for MKLDNN. --- include/mxnet/ndarray.h | 65 ++++++++++----------- include/mxnet/tensor_blob.h | 29 ---------- src/executor/attach_op_execs_pass.cc | 14 +---- src/kvstore/kvstore_dist.h | 20 ------- src/ndarray/ndarray.cc | 87 ++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 97 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 8398b7bf7291..1748e1ec5d46 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -35,12 +35,12 @@ #include #include #include +#if MXNET_USE_MKLDNN == 1 +#include +#endif #include "./base.h" #include "./storage.h" #include "./engine.h" -#if MKL_EXPERIMENTAL == 1 -#include -#endif // check c++11 #if DMLC_USE_CXX11 == 0 #error "cxx11 was required for ndarray module" @@ -61,6 +61,9 @@ enum NDArrayStorageType { kDefaultStorage, // dense kRowSparseStorage, // row sparse kCSRStorage, // csr +#if MXNET_USE_MKLDNN == 1 + kMKLDNNStorage, // MKLDNN +#endif }; enum NDArrayFormatErr { @@ -72,6 +75,7 @@ enum NDArrayFormatErr { kRSPIdxErr, // indices error for row sparse }; +class MKLDNNMemory; /*! * \brief ndarray interface @@ -80,9 +84,6 @@ class NDArray { public: /*! \brief default constructor */ NDArray() { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = MKLMemHolder::create(); -#endif } /*! * \brief constructs a new dynamic NDArray @@ -96,9 +97,6 @@ class NDArray { : ptr_(std::make_shared(shape, ctx, delay_alloc, dtype)), shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! \brief constructor for NDArray with storage type */ @@ -142,9 +140,6 @@ class NDArray { } ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, dtype, aux_types, aux_shapes); -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! * \brief constructing a static NDArray that shares data with TBlob @@ -157,17 +152,11 @@ class NDArray { : ptr_(std::make_shared(data, dev_id)), shape_(data.shape_), dtype_(data.type_flag_), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! \brief create ndarray from shared memory */ NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype) : ptr_(std::make_shared(shared_pid, shared_id, shape, dtype)), shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! @@ -184,9 +173,6 @@ class NDArray { const TBlob &data, const std::vector &aux_data, int dev_id) : ptr_(std::make_shared(stype, data, aux_data, dev_id)), shape_(shape), dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } @@ -271,9 +257,6 @@ class NDArray { << "Unexpected storage type: " << stype; res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); }); -#if MKL_EXPERIMENTAL == 1 - res.Mkl_mem_ = Mkl_mem_; -#endif return res; } /*! 
@@ -531,12 +514,6 @@ class NDArray { CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - // convert prv to cpu - Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr); - } -#endif NDArray ret = *this; ret.shape_ = shape; ret.dtype_ = dtype; @@ -608,6 +585,21 @@ class NDArray { << "CheckAndAllocAuxData is not intended for kDefaultStorage"; ptr_->CheckAndAllocAuxData(i, aux_shape); } + +#if MXNET_USE_MKLDNN == 1 + std::shared_ptr GetMKLDNNData() const; + std::shared_ptr GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc, + std::vector &net) const; + std::shared_ptr GetMKLDNNData(); + std::shared_ptr GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc, + std::vector &net); + + std::shared_ptr CreateMKLDNNData( + const mkldnn::memory::primitive_desc &desc); +#endif + /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. @@ -874,6 +866,11 @@ class NDArray { } }; // struct Chunk +#if MXNET_USE_MKLDNN == 1 + // Have MKL memory reference to the data in TBlob. + void SetMKLMem(); +#endif + void SetTBlob() const { CHECK(ptr_ != nullptr); TShape shape = shape_; @@ -882,6 +879,7 @@ class NDArray { if (stype == kDefaultStorage) { dptr += byte_offset_; } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + CHECK_NE(byte_offset_, 0); shape = storage_shape(); } else { LOG(FATAL) << "unknown storage type " << stype; @@ -890,13 +888,10 @@ class NDArray { tblob_.shape_ = shape; tblob_.type_flag_ = dtype_; tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); -#if MKL_EXPERIMENTAL == 1 - tblob_.Mkl_mem_ = Mkl_mem_; -#endif } -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr Mkl_mem_; +#if MXNET_USE_MKLDNN == 1 + std::shared_ptr Mkl_mem_; #endif /*! \brief internal data of NDArray */ std::shared_ptr ptr_{nullptr}; diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index b65cd2b434e4..168ddcca24b7 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -36,9 +36,6 @@ #include #include #include "./base.h" -#if MXNET_USE_MKL2017 == 1 -#include -#endif namespace mxnet { /* Forward declaration for friend declaration in TBlob */ @@ -66,17 +63,10 @@ class TBlob { /*! \brief type flag of the tensor blob */ int type_flag_; - /*! \brief storing mkl chunk buffer blob, use for experimental only */ -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr Mkl_mem_; -#endif /*! \brief default constructor, default copy assign will work */ TBlob(void) : dptr_(NULL), type_flag_(mshadow::DataType::kFlag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(cpu::kDevMask, 0); } /*! @@ -90,9 +80,6 @@ class TBlob { TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(mshadow::DataType::kFlag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -105,9 +92,6 @@ class TBlob { */ TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(type_flag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -135,9 +119,6 @@ class TBlob { shape_ = src.shape_; type_flag_ = mshadow::DataType::kFlag; SetDLTensor(Device::kDevMask, -1); -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif return *this; } /*! 
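The new NDArray accessors above (this copy of the diff has dropped the angle-bracketed text; from the ndarray.cc implementation later in the series, the return type is a shared_ptr to mkldnn::memory and the net is a std::vector of mkldnn::primitive) are meant to be used as in this sketch: ask the array for memory matching a primitive's preferred descriptor, and let the call queue a reorder when the stored layout differs.

void RunWithPreferredLayout(const mxnet::NDArray& arr,
                            const mkldnn::memory::primitive_desc& want_pd) {
  std::vector<mkldnn::primitive> net;
  // Queues a reorder into `net` iff arr's stored layout differs from want_pd.
  auto mem = arr.GetMKLDNNData(want_pd, net);
  if (mem == nullptr) return;  // sparse storage: no conversion supported yet
  // ... push the compute primitives that consume *mem onto `net` here ...
  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
}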
@@ -172,11 +153,6 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - Mkl_mem_->check_and_prv_to_cpu(dptr_); - } -#endif return mshadow::Tensor(static_cast(dptr_), shape_.FlatTo2D(), shape_[shape_.ndim() - 1], @@ -217,11 +193,6 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - Mkl_mem_->check_and_prv_to_cpu(dptr_); - } -#endif return static_cast(dptr_); } /*! \brief device mask of the corresponding device */ diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 1bcc40a894dd..6bcfd6fcf614 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -30,11 +30,7 @@ #include "../common/utils.h" #include "../common/exec_utils.h" #include "./exec_pass.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../operator/mkl/mkl_memory-inl.h" -#include "../operator/mkl/mkl_util-inl.h" -#endif + namespace mxnet { namespace op { @@ -106,10 +102,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(state_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); -#if MKL_EXPERIMENTAL == 1 - mkl_tblobs_prv_to_cpu(in_data_); - mkl_tblobs_prv_to_cpu(out_data_); -#endif } ExecType exec_type() const override { @@ -175,10 +167,6 @@ class FComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(attrs_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); -#if MKL_EXPERIMENTAL == 1 - mkl_tblobs_prv_to_cpu(in_data_); - mkl_tblobs_prv_to_cpu(out_data_); -#endif } ExecType exec_type() const override { diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index b00d0de935f7..d0a968154afb 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -32,11 +32,6 @@ #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" -#if MKL_EXPERIMENTAL == 1 -#include -#include "../operator/mkl/mkl_memory-inl.h" -#include "../operator/mkl/mkl_util-inl.h" -#endif namespace mxnet { namespace kvstore { @@ -228,9 +223,6 @@ class KVStoreDist : public KVStoreLocal { PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(recv_buf.data()); -#endif real_t* data = recv_buf.data().dptr(); // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); @@ -380,9 +372,6 @@ class KVStoreDist : public KVStoreLocal { [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { size_t size = small_buf.shape().Size(); real_t* data = small_buf.data().dptr(); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(small_buf.data()); -#endif // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -407,9 +396,6 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys size_t size = send_buf.shape().Size(); real_t* data = send_buf.data().dptr(); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); -#endif // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -431,9 +417,6 @@ class KVStoreDist : public KVStoreLocal { using namespace rowsparse; auto push_to_servers = [this, key, send_buf] (RunContext rctx, Engine::CallbackOnComplete cb) { -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); -#endif real_t* data = send_buf.data().dptr(); const int64_t num_rows = send_buf.aux_shape(kIdx)[0]; const auto offsets = send_buf.aux_data(kIdx).dptr(); @@ -472,9 +455,6 @@ class KVStoreDist : public KVStoreLocal { // allocate memory for the buffer size_t num_rows = indices.shape().Size(); recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)}); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(recv_buf.data()); -#endif real_t* data = recv_buf.data().dptr(); const auto offsets = indices.data().dptr(); const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim()); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f09f168977ab..c87b0be3e0c7 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -181,6 +181,93 @@ void NDArray::set_fresh_out_grad(bool state) const { info.fresh_out_grad = state; } +static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { + switch(dtype) { + case mshadow::kFloat32: + return mkldnn::memory::data_type::f32; + default: + return mkldnn::memory::data_type::data_undef; + } +} + +#if MXNET_USE_MKLDNN == 1 +void NDArray::SetMKLMem() { + if (Mkl_mem_ || storage_type() != kDefaultStorage) + return; + + mkldnn::memory::dims dims(shape_.ndim()); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = shape_[i]; + mkldnn::memory::desc data_md({dims}, get_mkldnn_type(dtype_), + // TODO is this the right layout? + mkldnn::memory::format::nchw); + // TODO do I specify the right CPU index? + auto cpu_engine = mkldnn::engine(mkldnn::engine::cpu, 0); + Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, + cpu_engine), data().dptr_)); +} + +std::shared_ptr NDArray::GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc, + std::vector &net) const { + const_cast(this)->SetMKLMem(); + if (Mkl_mem_ && Mkl_mem_->get_primitive_desc() == desc) + return Mkl_mem_; + else if (Mkl_mem_) { + // TODO we should manage the memory allocation here. + std::shared_ptr ret(new mkldnn::memory(desc)); + net.push_back(mkldnn::reorder(*Mkl_mem_, *ret)); + return ret; + } + else + // TODO We don't support converting sparse format. + return nullptr; +} + +std::shared_ptr NDArray::GetMKLDNNData() const { + const_cast(this)->SetMKLMem(); + if (Mkl_mem_) + return Mkl_mem_; + else + // TODO We don't support converting sparse format. + return nullptr; +} + +std::shared_ptr NDArray::GetMKLDNNData() { + SetMKLMem(); + if (Mkl_mem_) + return Mkl_mem_; + else + // TODO We don't support converting sparse format. + return nullptr; +} + +std::shared_ptr NDArray::GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc, + std::vector &net) { + SetMKLMem(); + if (Mkl_mem_ && Mkl_mem_->get_primitive_desc() == desc) + return Mkl_mem_; + else if (Mkl_mem_) { + // TODO we should manage the memory allocation here. + std::shared_ptr ret(new mkldnn::memory(desc)); + net.push_back(mkldnn::reorder(*Mkl_mem_, *ret)); + return ret; + } + else + // TODO We don't support converting sparse format. 
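The layout-matching logic that GetMKLDNNData() implements below, shown in isolation: compare primitive descriptors and queue an explicit reorder when they differ. A sketch against the same mkldnn 0.x calls used in this patch:

std::shared_ptr<mkldnn::memory> MatchOrReorder(
    const std::shared_ptr<mkldnn::memory>& src,
    const mkldnn::memory::primitive_desc& desc,
    std::vector<mkldnn::primitive>* net) {
  if (src->get_primitive_desc() == desc)
    return src;                                 // already in the right layout
  std::shared_ptr<mkldnn::memory> dst(new mkldnn::memory(desc));
  net->push_back(mkldnn::reorder(*src, *dst));  // queue the layout conversion
  return dst;
}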
+ return nullptr; +} + +std::shared_ptr NDArray::CreateMKLDNNData( + const mkldnn::memory::primitive_desc &desc) { + CHECK(Mkl_mem_ == nullptr); + CHECK(storage_type() == kMKLDNNStorage); + // TODO we should manage the memory allocation here. + Mkl_mem_.reset(new mkldnn::memory(desc)); + return Mkl_mem_; +} +#endif /*! * \brief run a ternary operation From db10bb1761908c37b198697d7d4e191a62d07373 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 24 Oct 2017 10:40:45 -0700 Subject: [PATCH 043/264] Enable MKLDNN Relu. --- src/operator/nn/activation-inl.h | 75 +++++++++------ src/operator/nn/activation.cc | 62 ++++++++++-- src/operator/nn/mkldnn/mkldnn_relu-inl.h | 117 +++++++++++++++++++++++ 3 files changed, 217 insertions(+), 37 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_relu-inl.h diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index d8da30b7263a..f297d92cf598 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -100,31 +100,25 @@ void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, } template -void ActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { +void _ActivationCompute(const ActivationParam ¶m, const OpContext &ctx, + const TBlob &input, OpReqType req, const TBlob &output) { + MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, { switch (param.act_type) { case activation::kReLU: ActivationForward( - ctx, inputs[0], req[0], outputs[0]); + ctx, input, req, output); break; case activation::kSigmoid: ActivationForward( - ctx, inputs[0], req[0], outputs[0]); + ctx, input, req, output); break; case activation::kTanh: ActivationForward( - ctx, inputs[0], req[0], outputs[0]); + ctx, input, req, output); break; case activation::kSoftReLU: ActivationForward( - ctx, inputs[0], req[0], outputs[0]); + ctx, input, req, output); break; default: LOG(FATAL) << "unknown activation type"; @@ -133,36 +127,26 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs, } template -void ActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { -#if MXNET_USE_CUDNN == 1 - CHECK_EQ(inputs.size(), 3U); -#else - CHECK_EQ(inputs.size(), 2U); -#endif - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { +void _ActivationGradCompute(const ActivationParam ¶m, const OpContext &ctx, + const TBlob &out_grad, const TBlob &out_data, + OpReqType req, const TBlob &output) { + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { switch (param.act_type) { case activation::kReLU: ActivationBackward( - ctx, inputs[0], inputs[1], req[0], outputs[0]); + ctx, out_grad, out_data, req, output); break; case activation::kSigmoid: ActivationBackward( - ctx, inputs[0], inputs[1], req[0], outputs[0]); + ctx, out_grad, out_data, req, output); break; case activation::kTanh: ActivationBackward( - ctx, inputs[0], inputs[1], req[0], outputs[0]); + ctx, out_grad, out_data, req, output); break; case activation::kSoftReLU: ActivationBackward( - ctx, inputs[0], inputs[1], req[0], outputs[0]); + ctx, out_grad, out_data, req, output); break; 
default: LOG(FATAL) << "unknown activation type"; @@ -170,6 +154,35 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs, }); } +template +void ActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + _ActivationCompute(param, ctx, inputs[0], req[0], outputs[0]); +} + +template +void ActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(inputs.size(), 3U); +#else + CHECK_EQ(inputs.size(), 2U); +#endif + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + _ActivationGradCompute(param, ctx, inputs[0], inputs[1], req[0], outputs[0]); +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_ACTIVATION_INL_H_ diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index c437b685ddc6..cfa75eb4e1eb 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -26,11 +26,9 @@ #include "./activation-inl.h" #include "../mshadow_op.h" #include "../tensor/elemwise_unary_op.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../mkl/mkl_memory-inl.h" -#include "../mkl/mkl_relu-inl.h" -#endif // MXNET_USE_MKL2017 +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_relu-inl.h" +#endif // MXNET_USE_MKLDNN namespace mxnet { namespace op { @@ -51,6 +49,56 @@ struct ActivationGrad { } }; +static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); +#if MXNET_USE_MKLDNN == 1 + if (param.act_type == activation::kReLU) { + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNRelu_Forward(ctx, inputs[0], req[0], outputs[0]); + return; + default: + break; + } + } +#endif + _ActivationCompute(param, ctx, inputs[0].data(), req[0], + outputs[0].data()); +} + +void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(inputs.size(), 3U); +#else + CHECK_EQ(inputs.size(), 2U); +#endif + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_MKLDNN == 1 + if (param.act_type == activation::kReLU) { + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNRelu_Backward(ctx, inputs[0], inputs[1], req[0], + outputs[0]); + return; + default: + break; + } + } +#endif + _ActivationGradCompute(param, ctx, inputs[0].data(), inputs[1].data(), + req[0], outputs[0].data()); +} + MXNET_OPERATOR_REGISTER_UNARY(Activation) .describe(R"code(Applies an activation function element-wise to the input. 
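The two entry points above pair an NDArray path (FComputeEx) with the TBlob path (FCompute): the Ex version takes the MKLDNN fast path when it applies and otherwise peels NDArrays back to TBlobs via .data(). As a generic sketch, where MyCompute, MyComputeEx, MyMKLDNNKernel, and SupportsMKLDNN are hypothetical names:

void MyComputeEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
                 const std::vector<NDArray>& inputs,
                 const std::vector<OpReqType>& req,
                 const std::vector<NDArray>& outputs) {
#if MXNET_USE_MKLDNN == 1
  if (SupportsMKLDNN(inputs[0])) {                        // hypothetical predicate
    MyMKLDNNKernel(ctx, inputs[0], req[0], outputs[0]);   // hypothetical kernel
    return;
  }
#endif
  // Fallback: reuse the dense TBlob implementation unchanged.
  MyCompute(attrs, ctx, {inputs[0].data()}, req, {outputs[0].data()});
}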
@@ -64,6 +112,7 @@ The following activation functions are supported: )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", ActivationCompute) +.set_attr("FComputeEx", ActivationComputeEx_CPU) .set_attr("FGradient", ActivationGrad{"_backward_Activation"}) .add_arguments(ActivationParam::__FIELDS__()); @@ -77,7 +126,8 @@ NNVM_REGISTER_OP(_backward_Activation) return std::vector >{{0, 0}}; }) .set_attr_parser(ParamParser) -.set_attr("FCompute", ActivationGradCompute); +.set_attr("FCompute", ActivationGradCompute) +.set_attr("FComputeEx", ActivationGradComputeEx_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_relu-inl.h b/src/operator/nn/mkldnn/mkldnn_relu-inl.h new file mode 100644 index 000000000000..a9f5a99a43ef --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_relu-inl.h @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_relu-inl.h + * \brief + * \author Da Zheng +*/ + +#ifndef MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_ +#define MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../operator_common.h" + +#if MXNET_USE_MKLDNN == 1 + +#include + +namespace mxnet { +namespace op { + +template +mkldnn::memory::data_type GetMKLDNNType() { + return mkldnn::memory::data_type::data_undef; +} + +template<> +mkldnn::memory::data_type GetMKLDNNType() { + return mkldnn::memory::data_type::f32; +} + +template +void MKLDNNRelu_Forward(const OpContext &ctx, const NDArray &in_data, + const OpReqType &req, const NDArray &out_data) { + std::shared_ptr input_mem = in_data.GetMKLDNNData(); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + auto cpu_engine = data_mpd.get_engine(); + Dtype alpha = 0; + + mkldnn::eltwise_forward::desc desc = ctx.is_train + ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, + mkldnn::eltwise_relu, data_md, alpha) + : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, + mkldnn::eltwise_relu, data_md, alpha); + mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine); + + std::vector net; + // TODO should we allocate memory here? 
+ std::shared_ptr output_memory + = out_data.GetMKLDNNData(pdesc.dst_primitive_desc(), net); + net.push_back(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +template +void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad, + const NDArray &in_data, const OpReqType &req, + const NDArray &in_grad) { + if (req == kNullOp) { + return; + } + + // TODO we need to handle req + std::shared_ptr diff_dst_memory = out_grad.GetMKLDNNData(); + // TODO shouldn't it be out_data? + std::shared_ptr input_mem = in_data.GetMKLDNNData(); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); + auto cpu_engine = data_mpd.get_engine(); + Dtype alpha = 0; + + mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training, + mkldnn::eltwise_relu, data_md, alpha); + mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine); + mkldnn::eltwise_backward::desc bw_desc(mkldnn::eltwise_relu, diff_md, data_md, alpha); + mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); + + std::vector net; + std::shared_ptr diff_src_memory + = in_grad.GetMKLDNNData(bw_pdesc.diff_src_primitive_desc(), net); + net.push_back(mkldnn::eltwise_backward(bw_pdesc, *input_mem, + *diff_dst_memory, *diff_src_memory)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +} // namespace op +} // namespace mxnet + +#endif +#endif // MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_ From 99c1e0859be834179e639bc60891116d31aa6bd0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 29 Nov 2017 23:06:16 +0000 Subject: [PATCH 044/264] Fix a compilation error. --- src/ndarray/ndarray.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index c87b0be3e0c7..ea833c82a14f 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -22,6 +22,7 @@ * \file ndarray.cc * \brief ndarry module of mxnet */ +#include #include #include #include From a6c2c82abd8ab0e2fa41fd1f16f8d69c9fa1eb61 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 24 Oct 2017 11:09:20 -0700 Subject: [PATCH 045/264] Change Makefile for MKLDNN. 
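Pulling the pieces of mkldnn_relu-inl.h together, a self-contained eltwise ReLU against the mkldnn 0.x API used in this series; the shape and fill values are arbitrary:

#include <mkldnn.hpp>
#include <vector>

int main() {
  using namespace mkldnn;
  engine cpu_engine(engine::cpu, 0);
  memory::desc md({2, 3, 4, 5}, memory::data_type::f32, memory::format::nchw);
  std::vector<float> in(2 * 3 * 4 * 5, -1.f), out(in.size());
  memory src({md, cpu_engine}, in.data());
  memory dst({md, cpu_engine}, out.data());
  eltwise_forward::desc d(prop_kind::forward_scoring, eltwise_relu, md, 0.f);
  eltwise_forward::primitive_desc pd(d, cpu_engine);
  std::vector<primitive> net;
  net.push_back(eltwise_forward(pd, src, dst));
  stream(stream::kind::eager).submit(net).wait();  // out = max(in, 0)
  return 0;
}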
--- Makefile | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 8584ab658e51..e49afdd00f88 100644 --- a/Makefile +++ b/Makefile @@ -40,11 +40,11 @@ endif # use customized config file include $(config) -ifeq ($(USE_MKL2017), 1) -# must run ./prepare_mkl before including mshadow.mk - RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT)) - MKLROOT := $(firstword $(RETURN_STRING)) - export USE_MKLML = $(lastword $(RETURN_STRING)) +ifeq ($(USE_MKLDNN), 1) + RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT)) + MKLDNNROOT := $(firstword $(RETURN_STRING)) + MKLROOT := $(lastword $(RETURN_STRING)) + export USE_MKLML = 1 endif include mshadow/make/mshadow.mk @@ -112,23 +112,16 @@ ifeq ($(USE_NNPACK), 1) LDFLAGS += -lnnpack endif -ifeq ($(USE_MKL2017), 1) - CFLAGS += -DMXNET_USE_MKL2017=1 +ifeq ($(USE_MKLDNN), 1) + CFLAGS += -DMXNET_USE_MKLDNN=1 CFLAGS += -DUSE_MKL=1 - CFLAGS += -I$(ROOTDIR)/src/operator/mkl/ - CFLAGS += -I$(MKLML_ROOT)/include - LDFLAGS += -L$(MKLML_ROOT)/lib - ifeq ($(USE_MKL2017_EXPERIMENTAL), 1) - CFLAGS += -DMKL_EXPERIMENTAL=1 - else - CFLAGS += -DMKL_EXPERIMENTAL=0 - endif - ifeq ($(UNAME_S), Darwin) - LDFLAGS += -lmklml - else - LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu + CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/ + ifneq ($(MKLDNNROOT), $(MKLROOT)) + CFLAGS += -I$(MKLROOT)/include + LDFLAGS += -L$(MKLROOT)/lib endif - LDFLAGS += -liomp5 + CFLAGS += -I$(MKLDNNROOT)/include + LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn endif ifeq ($(USE_OPERATOR_TUNING), 1) @@ -142,7 +135,7 @@ endif # - for Ubuntu, installing atlas will not automatically install the atlas provided lapack library # silently switching lapack off instead of letting the build fail because of backward compatibility ifeq ($(USE_LAPACK), 1) -ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas)) +ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl)) ifeq (,$(wildcard /lib/liblapack.a)) ifeq (,$(wildcard /usr/lib/liblapack.a)) ifeq (,$(wildcard /usr/lib64/liblapack.a)) @@ -160,7 +153,7 @@ ifeq ($(USE_LAPACK), 1) ifneq ($(USE_LAPACK_PATH), ) LDFLAGS += -L$(USE_LAPACK_PATH) endif - ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas)) + ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl)) LDFLAGS += -llapack endif CFLAGS += -DMXNET_USE_LAPACK @@ -546,7 +539,8 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN) else clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \ - R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz + R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \ + external/mkldnn/install/* cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - From 3f75f52a8214cae793bd22f0962d390b5b38b978 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 24 Oct 2017 19:44:21 -0700 Subject: [PATCH 046/264] Remove infer storage in convolution. --- src/operator/nn/convolution.cc | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 60c56d69d340..248755ec0bf4 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -400,17 +400,6 @@ There are other options to tune the performance. 
}) .set_attr("FInferShape", ConvolutionShape) .set_attr("FInferType", ConvolutionType) -.set_attr("FInferStorageType", [](const nnvm::NodeAttrs& attrs, - const int dev_mask, DispatchMode* dispatch_mode, - std::vector *in_attrs, std::vector *out_attrs) { - const ConvolutionParam& params = nnvm::get(attrs.parsed); - if (params.no_bias) - return ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, in_attrs, out_attrs); - else - return ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, in_attrs, out_attrs); -}) .set_attr("FCompute", ConvolutionCompute) .set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) .set_attr("FResourceRequest", [](const NodeAttrs& n) { From edf6842673fb39c332f8c5592725dcdf2273d43c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 24 Oct 2017 19:43:08 -0700 Subject: [PATCH 047/264] Update MXNet for MKLDNN. --- include/mxnet/ndarray.h | 99 ++--------- src/common/utils.cc | 16 ++ src/common/utils.h | 12 +- src/ndarray/ndarray.cc | 219 +++++++++++++++++++------ src/operator/tensor/cast_storage-inl.h | 12 +- src/operator/tensor/cast_storage.cc | 44 +++++ 6 files changed, 255 insertions(+), 147 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 1748e1ec5d46..ad31ef47abfe 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -103,44 +103,8 @@ class NDArray { NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, bool delay_alloc = true, int dtype = mshadow::default_type_flag, std::vector aux_types = {}, std::vector aux_shapes = {}, - TShape storage_shape = TShape(mshadow::Shape1(0))) - : shape_(shape), dtype_(dtype), storage_type_(stype), - entry_({nullptr, 0, 0}) { - // Assign default aux types if not given - if (aux_types.size() == 0) { - if (stype == kRowSparseStorage) { - aux_types = {mshadow::kInt64}; - } else if (stype == kCSRStorage) { - aux_types = {mshadow::kInt64, mshadow::kInt64}; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - // Assign default shapes if not given - // unknown shapes are intialized as {0} such that Size() would return 0 - if (aux_shapes.size() == 0) { - if (stype == kRowSparseStorage) { - aux_shapes = {TShape(mshadow::Shape1(0))}; - } else if (stype == kCSRStorage) { - // aux shapes for indptr and indices - aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - if (storage_shape.Size() == 0) { - if (stype == kRowSparseStorage) { - storage_shape = shape; - storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; - } else if (stype == kCSRStorage) { - storage_shape = aux_shapes[csr::kIdx]; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, - dtype, aux_types, aux_shapes); - } + TShape storage_shape = TShape(mshadow::Shape1(0))); + /*! 
* \brief constructing a static NDArray that shares data with TBlob * Use with caution: allocate ONLY ONE NDArray for each TBlob, @@ -591,10 +555,6 @@ class NDArray { std::shared_ptr GetMKLDNNData( const mkldnn::memory::primitive_desc &desc, std::vector &net) const; - std::shared_ptr GetMKLDNNData(); - std::shared_ptr GetMKLDNNData( - const mkldnn::memory::primitive_desc &desc, - std::vector &net); std::shared_ptr CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); @@ -634,6 +594,12 @@ class NDArray { for csr, aux_handles[0] = indptr, aux_handles[1] = indices */ std::vector aux_handles; + +#if MXNET_USE_MKLDNN == 1 + /*! This is created when data is stored in MKLDNN format. + */ + std::shared_ptr Mkl_mem_; +#endif /*! \brief variable from engine */ Engine::VarHandle var; /*! @@ -812,20 +778,14 @@ class NDArray { // storage shape is also updated // if data is already allocated, try reuse the storage. Otherwise, free the current one // and allocate new storage - inline void CheckAndAllocData(const TShape &shape, int dtype) { - CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); - } - // init shape - storage_shape = shape; - // delay_alloc is only set when data storage handle is present - delay_alloc = false; - } + void CheckAndAllocData(const TShape &shape, int dtype); + +#if MXNET_USE_MKLDNN == 1 + // Have MKL memory reference to the data in the default storage + // or create memory for MKLDNN. + void SetMKLMem(const TShape &shape, int dtype); +#endif + // create storage handle for aux data based on shape // this function assumes ctx, aux shapes and aux types are set // aux shape is also updated @@ -866,33 +826,8 @@ class NDArray { } }; // struct Chunk -#if MXNET_USE_MKLDNN == 1 - // Have MKL memory reference to the data in TBlob. - void SetMKLMem(); -#endif + void SetTBlob() const; - void SetTBlob() const { - CHECK(ptr_ != nullptr); - TShape shape = shape_; - char *dptr = static_cast(ptr_->shandle.dptr); - auto stype = storage_type(); - if (stype == kDefaultStorage) { - dptr += byte_offset_; - } else if (stype == kCSRStorage || stype == kRowSparseStorage) { - CHECK_NE(byte_offset_, 0); - shape = storage_shape(); - } else { - LOG(FATAL) << "unknown storage type " << stype; - } - tblob_.dptr_ = dptr; - tblob_.shape_ = shape; - tblob_.type_flag_ = dtype_; - tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); - } - -#if MXNET_USE_MKLDNN == 1 - std::shared_ptr Mkl_mem_; -#endif /*! \brief internal data of NDArray */ std::shared_ptr ptr_{nullptr}; /*! 
\brief shape of current NDArray */ diff --git a/src/common/utils.cc b/src/common/utils.cc index 784fcf8651ae..8f79fb870879 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -41,5 +41,21 @@ void CastStorageDispatch(const OpContext& ctx, mxnet::op::CastStorageComputeImpl(ctx, input, output); } +std::string stype_string(const int x) { + switch (x) { + case kDefaultStorage: + return "default"; + case kCSRStorage: + return "csr"; + case kRowSparseStorage: + return "row_sparse"; +#if MXNET_USE_MKLDNN == 1 + case kMKLDNNStorage: + return "mkldnn"; +#endif + } + return "unknown"; +} + } // namespace common } // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index 038ab2a04721..fcdf402fafed 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -327,17 +327,7 @@ inline std::string dispatch_mode_string(const DispatchMode x) { /*! \brief get string representation of storage_type */ -inline std::string stype_string(const int x) { - switch (x) { - case kDefaultStorage: - return "default"; - case kCSRStorage: - return "csr"; - case kRowSparseStorage: - return "row_sparse"; - } - return "unknown"; -} +std::string stype_string(const int x); // heuristic to dermine number of threads per GPU inline int GetNumThreadPerGPU() { diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index ea833c82a14f..604a0fa53356 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -36,6 +36,7 @@ #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tensor/init_op.h" +#include "../operator/nn/mkldnn/mkldnn_base-inl.h" #if MXNET_USE_OPENCV #include @@ -47,6 +48,79 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { +NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, + bool delay_alloc, int dtype, std::vector aux_types, + std::vector aux_shapes, TShape storage_shape) : shape_(shape), + dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) { + // Assign default aux types if not given + if (aux_types.size() == 0 +#if MXNET_USE_MKLDNN == 1 + && stype != kMKLDNNStorage +#endif + && stype != kDefaultStorage) { + if (stype == kRowSparseStorage) { + aux_types = {mshadow::kInt64}; + } else if (stype == kCSRStorage) { + aux_types = {mshadow::kInt64, mshadow::kInt64}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0 +#if MXNET_USE_MKLDNN == 1 + && stype != kMKLDNNStorage +#endif + && stype != kDefaultStorage) { + if (stype == kRowSparseStorage) { + aux_shapes = {TShape(mshadow::Shape1(0))}; + } else if (stype == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + if (storage_shape.Size() == 0 +#if MXNET_USE_MKLDNN == 1 + && stype != kMKLDNNStorage +#endif + && stype != kDefaultStorage) { + if (stype == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (stype == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); +} + +void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { + if (storage_type == kMKLDNNStorage) { + SetMKLMem(shape, 
dtype); + } + else { + CHECK_NE(aux_shapes.size(), 0) + << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); + } + // init shape + storage_shape = shape; + // delay_alloc is only set when data storage handle is present + delay_alloc = false; + } +} + NDArray NDArray::grad() const { if (Imperative::AGInfo::IsNone(*this)) return NDArray(); Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node); @@ -182,6 +256,7 @@ void NDArray::set_fresh_out_grad(bool state) const { info.fresh_out_grad = state; } +#if MXNET_USE_MKLDNN == 1 static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { switch(dtype) { case mshadow::kFloat32: @@ -191,70 +266,66 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { } } -#if MXNET_USE_MKLDNN == 1 -void NDArray::SetMKLMem() { - if (Mkl_mem_ || storage_type() != kDefaultStorage) +void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { + if (Mkl_mem_) return; - mkldnn::memory::dims dims(shape_.ndim()); + mkldnn::memory::dims dims(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) - dims[i] = shape_[i]; - mkldnn::memory::desc data_md({dims}, get_mkldnn_type(dtype_), - // TODO is this the right layout? - mkldnn::memory::format::nchw); - // TODO do I specify the right CPU index? - auto cpu_engine = mkldnn::engine(mkldnn::engine::cpu, 0); - Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, - cpu_engine), data().dptr_)); + dims[i] = shape[i]; + mkldnn::memory::format layout = mkldnn::memory::format::format_undef; + switch (shape.ndim()) { + case 1: layout = mkldnn::memory::format::x; break; + case 2: layout = mkldnn::memory::format::nc; break; + case 4: layout = mkldnn::memory::format::nchw; break; + default: LOG(FATAL) << "Unsupported number of dimensions for MKLDNN"; + } + mkldnn::memory::desc data_md({dims}, get_mkldnn_type(dtype), layout); + auto cpu_engine = CpuEngine::Instance().get_engine(); + // If the storage type is the default type, we can just simply + // reference to the memory for the default storage. + if (storage_type == kDefaultStorage) { + Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, + cpu_engine), shandle.dptr)); + } + // If the array uses MKLDNN storage, we need to allocate memory here. + else if (storage_type == kMKLDNNStorage) { + Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, + cpu_engine))); + } +} + +static int GetTypeSize(int dtype) { + MSHADOW_TYPE_SWITCH(dtype, DType, { + return sizeof(DType); + }); + return -1; } std::shared_ptr NDArray::GetMKLDNNData( const mkldnn::memory::primitive_desc &desc, std::vector &net) const { - const_cast(this)->SetMKLMem(); - if (Mkl_mem_ && Mkl_mem_->get_primitive_desc() == desc) - return Mkl_mem_; - else if (Mkl_mem_) { + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return nullptr; + } + if (ptr_->storage_type == kDefaultStorage) { + ptr_->Mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr)); + } + if (ptr_->Mkl_mem_->get_primitive_desc() == desc) + return ptr_->Mkl_mem_; + else { // TODO we should manage the memory allocation here. 
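+    // The requested layout differs from the array's current one: allocate
+    // a temporary memory with the requested primitive descriptor and queue
+    // a reorder from the array's buffer into it on `net`; the caller
+    // submits `net` to an mkldnn::stream to perform the conversion.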
std::shared_ptr ret(new mkldnn::memory(desc)); - net.push_back(mkldnn::reorder(*Mkl_mem_, *ret)); + net.push_back(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); return ret; } - else - // TODO We don't support converting sparse format. - return nullptr; } std::shared_ptr NDArray::GetMKLDNNData() const { - const_cast(this)->SetMKLMem(); - if (Mkl_mem_) - return Mkl_mem_; - else - // TODO We don't support converting sparse format. - return nullptr; -} - -std::shared_ptr NDArray::GetMKLDNNData() { - SetMKLMem(); - if (Mkl_mem_) - return Mkl_mem_; - else - // TODO We don't support converting sparse format. - return nullptr; -} - -std::shared_ptr NDArray::GetMKLDNNData( - const mkldnn::memory::primitive_desc &desc, - std::vector &net) { - SetMKLMem(); - if (Mkl_mem_ && Mkl_mem_->get_primitive_desc() == desc) - return Mkl_mem_; - else if (Mkl_mem_) { - // TODO we should manage the memory allocation here. - std::shared_ptr ret(new mkldnn::memory(desc)); - net.push_back(mkldnn::reorder(*Mkl_mem_, *ret)); - return ret; - } + ptr_->SetMKLMem(shape_, dtype_); + if (ptr_->Mkl_mem_) + return ptr_->Mkl_mem_; else // TODO We don't support converting sparse format. return nullptr; @@ -262,14 +333,42 @@ std::shared_ptr NDArray::GetMKLDNNData( std::shared_ptr NDArray::CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc) { - CHECK(Mkl_mem_ == nullptr); - CHECK(storage_type() == kMKLDNNStorage); + if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) + return ptr_->Mkl_mem_; + + // TODO the shape should also match. + CHECK_EQ(storage_type(), kMKLDNNStorage); // TODO we should manage the memory allocation here. - Mkl_mem_.reset(new mkldnn::memory(desc)); - return Mkl_mem_; + ptr_->Mkl_mem_.reset(new mkldnn::memory(desc)); + return ptr_->Mkl_mem_; } #endif +void NDArray::SetTBlob() const { + CHECK(ptr_ != nullptr); + TShape shape = shape_; + char *dptr = static_cast(ptr_->shandle.dptr); + auto stype = storage_type(); + if (stype == kDefaultStorage) { + dptr += byte_offset_; + } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + CHECK_EQ(byte_offset_, 0); + shape = storage_shape(); +#if MXNET_USE_MKLDNN == 1 + } else if (stype == kMKLDNNStorage) { + // TODO we may really need to convert format. + CHECK_EQ(byte_offset_, 0); + dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); +#endif + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + tblob_.dptr_ = dptr; + tblob_.shape_ = shape; + tblob_.type_flag_ = dtype_; + tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); +} + /*! 
* \brief run a ternary operation * \param lhs left operand @@ -544,6 +643,16 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext from.ctx(), to.ctx(), ctx); } +#if MXNET_USE_MKLDNN == 1 +inline void CopyFromToMKLDNNImpl(const NDArray& from, const NDArray& to, RunContext ctx) { + auto from_mem = from.GetMKLDNNData(); + auto to_mem = to.GetMKLDNNData(); + size_t size = std::min(from_mem->get_primitive_desc().get_size(), + to_mem->get_primitive_desc().get_size()); + memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); +} +#endif + // Make a copy of an NDArray based on storage type template void CopyFromToImpl(const NDArray& from, const NDArray& to, @@ -590,6 +699,10 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to, CopyFromToRspImpl(casted_nd, to, rctx); } else if (to_stype == kCSRStorage) { CopyFromToCsrImpl(casted_nd, to, rctx); +#if MXNET_USE_MKLDNN == 1 + } else if (to_stype == kMKLDNNStorage) { + CopyFromToMKLDNNImpl(casted_nd, to, rctx); +#endif } else { LOG(FATAL) << "unknown storage type" << to_stype; } diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index ebe19d41bbc4..8cb62bdaabac 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -324,6 +324,9 @@ void CastStorageCsrDnsImpl(const OpContext& ctx, }); } +void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns); +void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns); + template void CastStorageComputeImpl(const OpContext& ctx, const NDArray& input, @@ -342,8 +345,15 @@ void CastStorageComputeImpl(const OpContext& ctx, } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { TBlob ret = output.data(); CastStorageCsrDnsImpl(ctx, input, &ret); +#if MXNET_USE_MKLDNN == 1 + } else if (src_stype == kMKLDNNStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageMKLDnsImpl(ctx, input, &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kMKLDNNStorage) { + CastStorageDnsMKLImpl(ctx, input, output); +#endif } else { - LOG(FATAL) << "Not implemented"; + LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype; } } diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 9f257b140f7b..f1c226c9c83e 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -25,10 +25,54 @@ #include "./cast_storage-inl.h" #include "../elemwise_op_common.h" #include "../tensor/elemwise_unary_op.h" +#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { +static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { + switch(dtype) { + case mshadow::kFloat32: + return mkldnn::memory::data_type::f32; + default: + return mkldnn::memory::data_type::data_undef; + } +} + +void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) { + CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask); + CHECK(src.shape() == dns->shape_); + CHECK_EQ(src.dtype(), dns->type_flag_); + + mkldnn::memory::dims dims(dns->shape_.ndim()); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = dns->shape_[i]; + mkldnn::memory::format layout = mkldnn::memory::format::format_undef; + switch (dns->shape_.ndim()) { + case 1: layout = mkldnn::memory::format::x; break; + case 2: layout = mkldnn::memory::format::nc; break; + case 4: layout = mkldnn::memory::format::nchw; break; + default: LOG(FATAL) << 
"Unsupported number of dimensions for MKLDNN"; + } + mkldnn::memory::desc data_md({dims}, get_mkldnn_type(src.dtype()), layout); + auto cpu_engine = CpuEngine::Instance().get_engine(); + mkldnn::memory dst_mem(mkldnn::memory::primitive_desc(data_md, cpu_engine), dns->dptr_); + + std::vector net; + net.push_back(mkldnn::reorder(*src.GetMKLDNNData(), dst_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) { + CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask); + CHECK(dst.shape() == src.shape()); + CHECK_EQ(dst.dtype(), src.dtype()); + + std::vector net; + net.push_back(mkldnn::reorder(*src.GetMKLDNNData(), *dst.GetMKLDNNData())); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + DMLC_REGISTER_PARAMETER(CastStorageParam); NNVM_REGISTER_OP(cast_storage) .add_alias("_sparse_cast_storage") From c96ca2652d3412c5f879de43384b1e2078fb08a5 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Tue, 31 Oct 2017 15:52:55 +0000 Subject: [PATCH 048/264] Support MKLDNN storage type in python. --- python/mxnet/ndarray/ndarray.py | 1 + python/mxnet/ndarray/sparse.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index a45a6a82471e..885048e3ae91 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -52,6 +52,7 @@ _STORAGE_TYPE_DEFAULT = 0 _STORAGE_TYPE_ROW_SPARSE = 1 _STORAGE_TYPE_CSR = 2 +_STORAGE_TYPE_MKLDNN = 3 # pylint: disable= no-member _DTYPE_NP_TO_MX = { diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 700dee0b07fa..0a667741e144 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -49,7 +49,7 @@ pass from ._internal import _set_ndarray_class from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR +from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR, _STORAGE_TYPE_MKLDNN from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT from .ndarray import zeros as _zeros_ndarray from .ndarray import array as _array @@ -1138,6 +1138,8 @@ def _ndarray_cls(handle, writable=True, stype=_STORAGE_TYPE_UNDEFINED): stype = _storage_type(handle) if stype == _STORAGE_TYPE_DEFAULT: return NDArray(handle, writable=writable) + elif stype == _STORAGE_TYPE_MKLDNN: + return NDArray(handle, writable=False) elif stype == _STORAGE_TYPE_CSR: return CSRNDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_ROW_SPARSE: From 1a6e06ec5613d5f5365ee70ebd8051b014eae6c0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 24 Oct 2017 19:43:32 -0700 Subject: [PATCH 049/264] Update activation. 
--- src/operator/nn/activation.cc | 48 ++++++++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_relu-inl.h | 16 ++------ 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index cfa75eb4e1eb..1e18f4adfb5e 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -99,6 +99,52 @@ void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, req[0], outputs[0].data()); } +inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_MKLDNN == 1 + if (param.act_type == activation::kReLU + && dev_mask == mshadow::cpu::kDevMask) { + // TODO we don't know the type. + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + return ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); +} + +inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(in_attrs->size(), 3U); +#else + CHECK_EQ(in_attrs->size(), 2U); +#endif + CHECK_EQ(out_attrs->size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_MKLDNN == 1 + if (param.act_type == activation::kReLU + && dev_mask == mshadow::cpu::kDevMask) { + // TODO we don't know the type. + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + return ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); +} + MXNET_OPERATOR_REGISTER_UNARY(Activation) .describe(R"code(Applies an activation function element-wise to the input. 
@@ -111,6 +157,7 @@ The following activation functions are supported: )code" ADD_FILELINE) .set_attr_parser(ParamParser) +.set_attr("FInferStorageType", ActivationStorageType) .set_attr("FCompute", ActivationCompute) .set_attr("FComputeEx", ActivationComputeEx_CPU) .set_attr("FGradient", ActivationGrad{"_backward_Activation"}) @@ -120,6 +167,7 @@ NNVM_REGISTER_OP(_backward_Activation) .set_num_inputs(3) .set_num_outputs(1) .set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_ActStorageType) .set_attr("FInferShape", ElemwiseShape<3, 1>) .set_attr("FInferType", ElemwiseType<3, 1>) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ diff --git a/src/operator/nn/mkldnn/mkldnn_relu-inl.h b/src/operator/nn/mkldnn/mkldnn_relu-inl.h index a9f5a99a43ef..ada4bebe81d4 100644 --- a/src/operator/nn/mkldnn/mkldnn_relu-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_relu-inl.h @@ -36,6 +36,7 @@ #include #include #include "../../operator_common.h" +#include "./mkldnn_base-inl.h" #if MXNET_USE_MKLDNN == 1 @@ -44,16 +45,6 @@ namespace mxnet { namespace op { -template -mkldnn::memory::data_type GetMKLDNNType() { - return mkldnn::memory::data_type::data_undef; -} - -template<> -mkldnn::memory::data_type GetMKLDNNType() { - return mkldnn::memory::data_type::f32; -} - template void MKLDNNRelu_Forward(const OpContext &ctx, const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { @@ -71,9 +62,8 @@ void MKLDNNRelu_Forward(const OpContext &ctx, const NDArray &in_data, mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine); std::vector net; - // TODO should we allocate memory here? std::shared_ptr output_memory - = out_data.GetMKLDNNData(pdesc.dst_primitive_desc(), net); + = const_cast(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc()); net.push_back(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } @@ -104,7 +94,7 @@ void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad, std::vector net; std::shared_ptr diff_src_memory - = in_grad.GetMKLDNNData(bw_pdesc.diff_src_primitive_desc(), net); + = const_cast(in_grad).CreateMKLDNNData(bw_pdesc.diff_src_primitive_desc()); net.push_back(mkldnn::eltwise_backward(bw_pdesc, *input_mem, *diff_dst_memory, *diff_src_memory)); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); From ca30cac1a5b7ec3632558db8cb4308176d29894a Mon Sep 17 00:00:00 2001 From: Da zheng Date: Thu, 26 Oct 2017 01:34:43 +0000 Subject: [PATCH 050/264] Add MKLDNN base classes. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 125 +++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 src/operator/nn/mkldnn/mkldnn_base-inl.h diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h new file mode 100644 index 000000000000..2bad903a143e --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -0,0 +1,125 @@ +/******************************************************************************* +* Copyright 2016-2017 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_base-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include 
+#include 
+#include 
+#include "mkldnn.hpp"
+
+namespace mxnet {
+extern bool EnableMkldnnWarnGenerated();
+// ===== CpuEngine =======================================
+// cpu_engine singleton
+class CpuEngine {
+ public:
+  static CpuEngine & Instance() {
+    // It's thread-safe in C++11.
+    static thread_local CpuEngine myInstance;
+    return myInstance;
+  }
+  CpuEngine(CpuEngine const&) = delete;             // Copy construct
+  CpuEngine(CpuEngine&&) = delete;                  // Move construct
+  CpuEngine& operator=(CpuEngine const&) = delete;  // Copy assign
+  CpuEngine& operator=(CpuEngine &&) = delete;      // Move assign
+
+  mkldnn::engine & get_engine() { return _cpu_engine; }
+ protected:
+  CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {}
+  ~CpuEngine() {}
+ private:
+  mkldnn::engine _cpu_engine;
+};
+
+// type enumerator
+template<typename T>
+struct data_type_enum {};
+
+template<>
+struct data_type_enum<float> {
+  enum { type = mkldnn::memory::data_type::f32 };
+};
+
+template<>
+struct data_type_enum<int32_t> {
+  enum { type = mkldnn::memory::data_type::s32 };
+};
+
+template<>
+struct data_type_enum<int16_t> {
+  enum { type = mkldnn::memory::data_type::s16 };
+};
+
+template<>
+struct data_type_enum<int8_t> {
+  enum { type = mkldnn::memory::data_type::s8 };
+};
+
+template<>
+struct data_type_enum<uint8_t> {
+  enum { type = mkldnn::memory::data_type::u8 };
+};
+
+inline static std::shared_ptr<mkldnn::memory> GetWeights(const NDArray &arr,
+    const mkldnn::engine &engine, int num_groups = 1) {
+  if (arr.shape().ndim() == 2) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0],
+      (int) arr.shape()[1]};
+    mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32,
+      mkldnn::memory::format::oi};
+    mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine};
+    std::vector<mkldnn::primitive> net;
+    return arr.GetMKLDNNData(pd, net);
+  }
+  else if (arr.shape().ndim() == 4 && num_groups == 1) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0],
+      (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]};
+    mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32,
+      mkldnn::memory::format::oihw};
+    mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine};
+    std::vector<mkldnn::primitive> net;
+    return arr.GetMKLDNNData(pd, net);
+  }
+  else if (arr.shape().ndim() == 4) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups,
+      (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]};
+    mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32,
+      mkldnn::memory::format::goihw};
+    mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine};
+    std::vector<mkldnn::primitive> net;
+    return arr.GetMKLDNNData(pd, net);
+  }
+  else {
+    LOG(FATAL) << "The weight array has an unsupported number of dimensions";
+    return nullptr;
+  }
+}
+
+} // namespace mxnet
+#endif
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
From 79c563cd1138cc33bf88e6c0971232b666ec2ec1 Mon Sep 17 00:00:00 2001
From: Da zheng
Date: Sat, 28 Oct 2017 00:12:35 +0000
Subject: [PATCH 051/264] Implement MKLDNN fully connected. --- src/operator/nn/fully_connected.cc | 93 ++++++++++- .../nn/mkldnn/mkldnn_fully_connected.cc | 158 ++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_ops-inl.h | 54 ++++++ 3 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_fully_connected.cc create mode 100644 src/operator/nn/mkldnn/mkldnn_ops-inl.h diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index c4edf6dcab9b..1178c0729bd8 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -23,6 +23,7 @@ * \brief fully connect operator */ #include "./fully_connected-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_fully_connected-inl.h" #endif // MXNET_USE_NNPACK @@ -71,6 +72,46 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, return true; } +void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const std::vector &req, + const std::vector &outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNFC_Forward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + FullyConnectedCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +void FullyConnectedGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNFC_Backward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + FullyConnectedGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + static bool FullyConnectedType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { CHECK_GE(in_type->size(), 1U); @@ -89,6 +130,52 @@ struct FullyConnectedGrad { } }; +inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_FCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), out_expected); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + DMLC_REGISTER_PARAMETER(FullyConnectedParam); NNVM_REGISTER_OP(FullyConnected) @@ -119,6 +206,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored. }) .set_num_outputs(1) .set_attr_parser(ParamParser) +.set_attr("FInferStorageType", FCStorageType) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { const FullyConnectedParam& params = nnvm::get(attrs.parsed); if (!params.no_bias) { @@ -130,6 +218,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored. .set_attr("FInferShape", FullyConnectedShape) .set_attr("FInferType", FullyConnectedType) .set_attr("FCompute", FullyConnectedCompute) +.set_attr("FComputeEx", FullyConnectedCompute_CPU) .set_attr("FGradient", FullyConnectedGrad{"_backward_FullyConnected"}) .add_argument("data", "NDArray-or-Symbol", "Input data.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") @@ -145,8 +234,10 @@ NNVM_REGISTER_OP(_backward_FullyConnected) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{1, 0}}; }) +.set_attr("FInferStorageType", backward_FCStorageType) .set_attr_parser(ParamParser) -.set_attr("FCompute", FullyConnectedGradCompute); +.set_attr("FCompute", FullyConnectedGradCompute) +.set_attr("FComputeEx", FullyConnectedGradCompute_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc new file mode 100644 index 000000000000..49419f7c1fc3 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_fully_connected.cc + * \brief + * \author Da Zheng +*/ + +#include "../fully_connected-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( + const mkldnn::memory::desc &data_desc, const mkldnn::memory::desc &weight_desc, + const mkldnn::memory::desc &out_desc, const mkldnn::engine &engine, + std::shared_ptr bias_mem) { + if (bias_mem) { + auto bias_desc = bias_mem->get_primitive_desc().desc(); + mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, + data_desc, weight_desc, bias_desc, out_desc); + return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); + } + else { + mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, + data_desc, weight_desc, out_desc); + return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); + } +} + +inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwd( + const mkldnn::memory::desc &data_desc, const mkldnn::memory::desc &weight_desc, + const mkldnn::memory::desc &out_desc, const mkldnn::engine &engine, + mkldnn::inner_product_forward::primitive_desc ipFwd_pd, + std::shared_ptr bias_mem) { + if (bias_mem) { + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_desc, + weight_desc, bias_mem->get_primitive_desc().desc(), out_desc); + return mkldnn::inner_product_backward_weights::primitive_desc( + ipBwdWeights_desc, engine, ipFwd_pd); + } + else { + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_desc, + weight_desc, out_desc); + return mkldnn::inner_product_backward_weights::primitive_desc( + ipBwdWeights_desc, engine, ipFwd_pd); + } +} + +void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + auto data_mem = in_data[fullc::kData].GetMKLDNNData(); + auto data_desc = data_mem->get_primitive_desc().desc(); + auto cpu_engine = data_mem->get_primitive_desc().get_engine(); + CHECK_EQ(in_data[fullc::kWeight + 1].shape().ndim(), 2); + auto weight_mem = GetWeights(in_data[fullc::kWeight], cpu_engine); + auto weight_desc = weight_mem->get_primitive_desc().desc(); + auto out_mem = const_cast(out_data[fullc::kOut]).GetMKLDNNData(); + auto out_desc = out_mem->get_primitive_desc().desc(); + + std::vector net; + if (param.no_bias) { + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( + data_desc, weight_desc, out_desc, cpu_engine, nullptr); + CHECK(ipFwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + CHECK(ipFwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); + CHECK(ipFwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); + net.push_back(mkldnn::inner_product_forward(ipFwd_pd, *data_mem, *weight_mem, + *out_mem)); + } else { + auto bias_mem = in_data[fullc::kBias].GetMKLDNNData(); + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( + data_desc, weight_desc, out_desc, cpu_engine, bias_mem); + CHECK(ipFwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + CHECK(ipFwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); + CHECK(ipFwd_pd.bias_primitive_desc() == bias_mem->get_primitive_desc()); + CHECK(ipFwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); + net.push_back(mkldnn::inner_product_forward(ipFwd_pd, 
*data_mem, *weight_mem, + *bias_mem, *out_mem)); + } + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const std::vector &req, + const std::vector &outputs) { + const std::vector &in_grad = outputs; + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNData(); + auto out_grad_desc = out_grad_mem->get_primitive_desc().desc(); + auto data_mem = inputs[fullc::kData + 1].GetMKLDNNData(); + auto data_desc = data_mem->get_primitive_desc().desc(); + auto cpu_engine = data_mem->get_primitive_desc().get_engine(); + CHECK_EQ(inputs[fullc::kWeight + 1].shape().ndim(), 2); + auto weight_mem = GetWeights(inputs[fullc::kWeight + 1], cpu_engine); + auto weight_desc = weight_mem->get_primitive_desc().desc(); + std::shared_ptr in_grad_bias; + if (!param.no_bias) + in_grad_bias = const_cast(in_grad[fullc::kBias]).GetMKLDNNData(); + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data_desc, + weight_desc, out_grad_desc, cpu_engine, in_grad_bias); + + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; + std::vector net; + if (req[fullc::kData]) { + mkldnn::inner_product_backward_data::desc ipBwdData_desc(data_desc, weight_desc, + out_grad_desc); + mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd(ipBwdData_desc, + cpu_engine, ipFwd_pd); + CHECK(ipBwdData_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); + CHECK(ipBwdData_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); + auto in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( + ipBwdData_pd.diff_src_primitive_desc()); + net.push_back(mkldnn::inner_product_backward_data(ipBwdData_pd, *out_grad_mem, + *weight_mem, *in_grad_mem)); + } + if (req[fullc::kWeight]) { + mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetIPBwd( + data_desc, weight_desc, out_grad_desc, cpu_engine, ipFwd_pd, in_grad_bias); + CHECK(ipBwdWeights_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); + CHECK(ipBwdWeights_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + auto in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( + ipBwdWeights_pd.diff_weights_primitive_desc()); + if (param.no_bias) { + net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, + *data_mem, *out_grad_mem, *in_grad_weight)); + } else { + net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, + *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); + } + } + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +} +} +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h new file mode 100644 index 000000000000..73b95867f396 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_ops-inl.h + * \brief + * \author Da Zheng +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +/* For fully connected. */ +void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data); +void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const std::vector &req, + const std::vector &outputs); + +} +} +#endif // MXNET_USE_MKLDNN == 1 + +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ From 2f5ed280015a70ae1cb0a48bd71f6dbdf194d703 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Tue, 31 Oct 2017 15:48:39 +0000 Subject: [PATCH 052/264] Add MKLDNN convolution. --- src/operator/nn/convolution.cc | 185 ++++++++++---- src/operator/nn/mkldnn/mkldnn_convolution.cc | 253 +++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_ops-inl.h | 8 + 3 files changed, 397 insertions(+), 49 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_convolution.cc diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 248755ec0bf4..3b3a2cdc963d 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -26,11 +26,7 @@ #include "./convolution-inl.h" #include "../elemwise_op_common.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../mkl/mkl_memory-inl.h" -#include "../mkl/mkl_convolution-inl.h" -#endif // MXNET_USE_MKL2017 +#include "./mkldnn/mkldnn_ops-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_convolution-inl.h" #endif // MXNET_USE_NNPACK @@ -51,6 +47,46 @@ static inline std::vector ListArguments(const ConvolutionParam& par } } +static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNConvolution_Forward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ConvolutionCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNConvolution_Backward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO I need to convert format. 
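+  // Fallback path: expose each NDArray as a plain TBlob and reuse the
+  // default CPU implementation for dtypes the MKLDNN kernel above does
+  // not handle.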
+ std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ConvolutionGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, std::vector *out_shape) { @@ -67,50 +103,50 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, if (dshp.ndim() == 0) return false; if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } + // 1d conv + CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<3> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - return true; + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<3> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + return true; } else if (param_.kernel.ndim() == 2) { // 2d conv CHECK_EQ(dshp.ndim(), 4U) \ @@ -259,6 +295,53 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs, return true; } +inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_ConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 3 : 4; + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), out_expected); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) { using namespace mshadow; ConvolutionParam param_; @@ -400,7 +483,9 @@ There are other options to tune the performance. }) .set_attr("FInferShape", ConvolutionShape) .set_attr("FInferType", ConvolutionType) +.set_attr("FInferStorageType", ConvStorageType) .set_attr("FCompute", ConvolutionCompute) +.set_attr("FComputeEx", ConvolutionCompute_CPU) .set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; @@ -416,11 +501,13 @@ NNVM_REGISTER_OP(_backward_Convolution) return params.no_bias ? 2 : 3; }) .set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_ConvStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) .set_attr_parser(ConvolutionParamParser) -.set_attr("FCompute", ConvolutionGradCompute); +.set_attr("FCompute", ConvolutionGradCompute) +.set_attr("FComputeEx", ConvolutionGradCompute_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc new file mode 100644 index 000000000000..c137446a595d --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_convolution.cc + * \brief + * \author Da Zheng +*/ + +#include "../convolution-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +static mkldnn::convolution_forward::primitive_desc GetConvFwd( + const ConvolutionParam& param, bool is_train, + const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, + const mkldnn::memory::desc &out_md, const mkldnn::engine &engine, + std::shared_ptr bias_mem) { + auto prop = is_train ? 
mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (/*param.dilate.ndim() == 0 &&*/ bias_mem == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weights_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else /*if (param.dilate.ndim() == 0)*/ { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, + strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } +// else { +// // TODO I should test the case with dilate. +// mkldnn::memory::dims dilates{0, 0}; +// if (param.dilate.ndim() == 2) { +// dilates[0] = param.dilate[0]; +// dilates[1] = param.dilate[1]; +// } +// if (bias_mem == nullptr) { +// mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, +// data_md, weights_md, out_md, strides, dilates, padding, padding, +// mkldnn::padding_kind::zero); +// return mkldnn::convolution_forward::primitive_desc(desc, engine); +// } +// else { +// mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, +// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, +// strides, dilates, padding, padding, mkldnn::padding_kind::zero); +// return mkldnn::convolution_forward::primitive_desc(desc, engine); +// } +// } +} + +static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( + const ConvolutionParam& param, const mkldnn::memory::desc &data_md, + const mkldnn::memory::desc &weights_md, const mkldnn::memory::desc &out_md, + const mkldnn::engine &engine, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } +// if (param.dilate.ndim() == 0) { + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weights_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); +// } +// else { +// // TODO I should test the case with dilate. 
+// mkldnn::memory::dims dilates{0, 0}; +// if (param.dilate.ndim() == 2) { +// dilates[0] = param.dilate[0]; +// dilates[1] = param.dilate[1]; +// } +// mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, out_md, strides, dilates, padding, padding, +// mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); +// } +} + +static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( + const ConvolutionParam& param, const mkldnn::memory::desc &data_md, + const mkldnn::memory::desc &weights_md, const mkldnn::memory::desc &out_md, + const mkldnn::engine &engine, const mkldnn::convolution_forward::primitive_desc &fwd_pd, + std::shared_ptr bias_mem) { + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (/*param.dilate.ndim() == 0 &&*/ bias_mem == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weights_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else /*if (param.dilate.ndim() == 0)*/ { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, + strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } +// else { +// // TODO I should test the case with dilate. 
+// mkldnn::memory::dims dilates{0, 0}; +// if (param.dilate.ndim() == 2) { +// dilates[0] = param.dilate[0]; +// dilates[1] = param.dilate[1]; +// } +// if (bias_mem == nullptr) { +// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, out_md, strides, dilates, padding, padding, +// mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); +// } +// else { +// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, +// strides, dilates, padding, padding, mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); +// } +// } +} + +void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + auto data_mem = in_data[conv::kData].GetMKLDNNData(); + auto data_desc = data_mem->get_primitive_desc().desc(); + auto cpu_engine = data_mem->get_primitive_desc().get_engine(); + auto weight_mem = GetWeights(in_data[conv::kWeight], cpu_engine, param.num_group); + auto weight_desc = weight_mem->get_primitive_desc().desc(); + auto out_mem = const_cast(out_data[conv::kOut]).GetMKLDNNData(); + auto out_desc = out_mem->get_primitive_desc().desc(); + + std::vector net; + if (param.no_bias) { + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, + ctx.is_train, data_desc, weight_desc, out_desc, cpu_engine, nullptr); + CHECK(fwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + CHECK(fwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); + CHECK(fwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); + net.push_back(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, + *out_mem)); + } else { + auto bias_mem = in_data[conv::kBias].GetMKLDNNData(); + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, + ctx.is_train, data_desc, weight_desc, out_desc, cpu_engine, bias_mem); + CHECK(fwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + CHECK(fwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); + CHECK(fwd_pd.bias_primitive_desc() == bias_mem->get_primitive_desc()); + CHECK(fwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); + net.push_back(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, + *bias_mem, *out_mem)); + } + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs) { + const std::vector &in_grad = outputs; + const ConvolutionParam& param = nnvm::get(attrs.parsed); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNData(); + auto out_grad_desc = out_grad_mem->get_primitive_desc().desc(); + auto data_mem = inputs[conv::kData + 1].GetMKLDNNData(); + auto data_desc = data_mem->get_primitive_desc().desc(); + auto cpu_engine = data_mem->get_primitive_desc().get_engine(); + auto weight_mem = GetWeights(inputs[conv::kWeight + 1], cpu_engine, + param.num_group); + auto weight_desc = weight_mem->get_primitive_desc().desc(); + std::shared_ptr in_grad_bias; + if (!param.no_bias) + in_grad_bias = const_cast(in_grad[conv::kBias]).GetMKLDNNData(); + 
mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train,
+      data_desc, weight_desc, out_grad_desc, cpu_engine, in_grad_bias);
+
+  CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
+  std::vector<mkldnn::primitive> net;
+  if (req[conv::kData]) {
+    mkldnn::convolution_backward_data::primitive_desc bwdData_pd
+      = GetConvBwdData(param, data_desc, weight_desc, out_grad_desc, cpu_engine, fwd_pd);
+    CHECK(bwdData_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc());
+    CHECK(bwdData_pd.weights_primitive_desc() == weight_mem->get_primitive_desc());
+    auto in_grad_mem = const_cast<NDArray &>(in_grad[conv::kData]).CreateMKLDNNData(
+        bwdData_pd.diff_src_primitive_desc());
+    net.push_back(mkldnn::convolution_backward_data(bwdData_pd, *out_grad_mem,
+          *weight_mem, *in_grad_mem));
+  }
+  if (req[conv::kWeight]) {
+    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
+      = GetConvBwdWeights(param, data_desc, weight_desc, out_grad_desc,
+          cpu_engine, fwd_pd, in_grad_bias);
+    CHECK(bwdWeights_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc());
+    CHECK(bwdWeights_pd.src_primitive_desc() == data_mem->get_primitive_desc());
+    auto in_grad_weight = const_cast<NDArray &>(in_grad[conv::kWeight]).CreateMKLDNNData(
+        bwdWeights_pd.diff_weights_primitive_desc());
+    if (param.no_bias) {
+      net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd,
+            *data_mem, *out_grad_mem, *in_grad_weight));
+    } else {
+      net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd,
+            *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias));
+    }
+  }
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 73b95867f396..e2c8b986e407 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -47,6 +47,14 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                        const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
                        const std::vector<NDArray> &outputs);
 
+/* For convolution. */
+void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data);
+void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs);
+
 }
 }
 #endif  // MXNET_USE_MKLDNN == 1

From 126b85e50e1ef59c0cb2fa55a3eef87b33a55816 Mon Sep 17 00:00:00 2001
From: Da zheng
Date: Thu, 2 Nov 2017 19:59:03 +0000
Subject: [PATCH 053/264] Update MKLDNN interface in NDArray.
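
This patch adds NDArray::CopyFrom() and hardens CreateMKLDNNData(). CopyFrom()
does not copy immediately: it queues an mkldnn::reorder from the given memory
into the array's own MKLDNN buffer, and the copy only runs once the caller
submits the primitive vector to a stream. A minimal usage sketch, where `arr`
and `tmp_mem` are illustrative names rather than code from this patch:

    std::vector<mkldnn::primitive> net;
    arr.CopyFrom(*tmp_mem, net);   // queues a reorder; no data moves yet
    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();

CreateMKLDNNData() now returns nullptr when the array cannot hand out memory
with the requested primitive_desc (wrong storage type), so callers can detect
the failure and fall back to a temporary buffer; the next patch uses this in
the convolution backward pass.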
---
 include/mxnet/ndarray.h |  1 +
 src/ndarray/ndarray.cc  | 20 ++++++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index ad31ef47abfe..db9784120703 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -556,6 +556,7 @@ class NDArray {
       const mkldnn::memory::primitive_desc &desc,
       std::vector<mkldnn::primitive> &net) const;
 
+  void CopyFrom(const mkldnn::memory &mem, std::vector<mkldnn::primitive> &net);
   std::shared_ptr<mkldnn::memory> CreateMKLDNNData(
       const mkldnn::memory::primitive_desc &desc);
 #endif
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 604a0fa53356..0ed9b0c97d8e 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -331,13 +331,29 @@ std::shared_ptr<mkldnn::memory> NDArray::GetMKLDNNData() const {
     return nullptr;
 }
 
+void NDArray::CopyFrom(const mkldnn::memory &mem,
+                       std::vector<mkldnn::primitive> &net) {
+  if (ptr_ == nullptr) {
+    LOG(FATAL) << "The NDArray hasn't been initialized";
+    return;
+  }
+  ptr_->SetMKLMem(shape_, dtype_);
+  net.push_back(mkldnn::reorder(mem, *ptr_->Mkl_mem_));
+}
+
 std::shared_ptr<mkldnn::memory> NDArray::CreateMKLDNNData(
     const mkldnn::memory::primitive_desc &desc) {
+  if (storage_type() != kMKLDNNStorage)
+    return nullptr;
+
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+
   if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc)
     return ptr_->Mkl_mem_;
 
-  // TODO the shape should also match.
-  CHECK_EQ(storage_type(), kMKLDNNStorage);
   // TODO we should manage the memory allocation here.
   ptr_->Mkl_mem_.reset(new mkldnn::memory(desc));
   return ptr_->Mkl_mem_;

From 76726089564b7e6e500db9b96fa62dc281562ad8 Mon Sep 17 00:00:00 2001
From: Da zheng
Date: Thu, 2 Nov 2017 20:01:27 +0000
Subject: [PATCH 054/264] Handle CreateMKLDNNData failure in MKLDNN convolution.

---
 src/operator/nn/mkldnn/mkldnn_base-inl.h     |  6 ++++++
 src/operator/nn/mkldnn/mkldnn_convolution.cc | 21 ++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 2bad903a143e..a0a5da2a94f2 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -120,6 +120,12 @@ inline static std::shared_ptr<mkldnn::memory> GetWeights(const NDArray &ar
   }
 }
 
+inline static std::shared_ptr<mkldnn::memory> CreateMKLDNNMem(
+    const mkldnn::memory::primitive_desc &desc) {
+  // TODO allocate memory more efficiently.
+ return std::shared_ptr(new mkldnn::memory(desc)); +} + } // namespace mxnet #endif #endif // MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index c137446a595d..7ac0c3a473bd 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -218,15 +218,24 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; std::vector net; + std::shared_ptr in_grad_mem, in_grad_weight; if (req[conv::kData]) { mkldnn::convolution_backward_data::primitive_desc bwdData_pd = GetConvBwdData(param, data_desc, weight_desc, out_grad_desc, cpu_engine, fwd_pd); CHECK(bwdData_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); CHECK(bwdData_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - auto in_grad_mem = const_cast(in_grad[conv::kData]).CreateMKLDNNData( + + in_grad_mem = const_cast(in_grad[conv::kData]).CreateMKLDNNData( bwdData_pd.diff_src_primitive_desc()); + bool copy_back = false; + if (in_grad_mem == nullptr) { + in_grad_mem = CreateMKLDNNMem(bwdData_pd.diff_src_primitive_desc()); + copy_back = true; + } net.push_back(mkldnn::convolution_backward_data(bwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem)); + if (copy_back) + const_cast(in_grad[conv::kData]).CopyFrom(*in_grad_mem, net); } if (req[conv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd @@ -234,8 +243,13 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c cpu_engine, fwd_pd, in_grad_bias); CHECK(bwdWeights_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); CHECK(bwdWeights_pd.src_primitive_desc() == data_mem->get_primitive_desc()); - auto in_grad_weight = const_cast(in_grad[conv::kWeight]).CreateMKLDNNData( + in_grad_weight = const_cast(in_grad[conv::kWeight]).CreateMKLDNNData( bwdWeights_pd.diff_weights_primitive_desc()); + bool copy_back = false; + if (in_grad_weight == nullptr) { + in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); + copy_back = true; + } if (param.no_bias) { net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); @@ -243,6 +257,9 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); } + if (copy_back) { + const_cast(in_grad[conv::kWeight]).CopyFrom(*in_grad_weight, net); + } } mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } From 4064bef52aaa6e6ca71db924d8904bfbb642f4b4 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Fri, 3 Nov 2017 01:22:28 +0000 Subject: [PATCH 055/264] Add another GetMKLDNNData in NDArray. --- include/mxnet/ndarray.h | 14 ++++++++++++++ src/ndarray/ndarray.cc | 25 ++++++++++++++----------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index db9784120703..5cf9f50256af 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -551,7 +551,21 @@ class NDArray { } #if MXNET_USE_MKLDNN == 1 + /* + * This function returns mkldnn::memory with the default primitive_desc. 
+   */
   std::shared_ptr<mkldnn::memory> GetMKLDNNData() const;
+  /*
+   * This function returns mkldnn::memory with the given primitive_desc
+   * as long as the array size meets the required size in the given primitive_desc.
+   */
+  std::shared_ptr<mkldnn::memory> GetMKLDNNData(
+      const mkldnn::memory::primitive_desc &desc) const;
+  /*
+   * This function returns mkldnn::memory with the given primitive_desc.
+   * The returned mkldnn::memory will have the same physical layout as
+   * the given primitive_desc.
+   */
   std::shared_ptr<mkldnn::memory> GetMKLDNNData(
       const mkldnn::memory::primitive_desc &desc,
       std::vector<mkldnn::primitive> &net) const;
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 0ed9b0c97d8e..3ef4e5d74976 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -257,15 +257,6 @@ void NDArray::set_fresh_out_grad(bool state) const {
 }
 
 #if MXNET_USE_MKLDNN == 1
-static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
-  switch(dtype) {
-    case mshadow::kFloat32:
-      return mkldnn::memory::data_type::f32;
-    default:
-      return mkldnn::memory::data_type::data_undef;
-  }
-}
-
 void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
   if (Mkl_mem_)
     return;
@@ -280,7 +271,7 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
     case 4: layout = mkldnn::memory::format::nchw; break;
     default: LOG(FATAL) << "Unsupported number of dimensions for MKLDNN";
   }
-  mkldnn::memory::desc data_md({dims}, get_mkldnn_type(dtype), layout);
+  mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
   auto cpu_engine = CpuEngine::Instance().get_engine();
   // If the storage type is the default type, we can just simply
   // reference to the memory for the default storage.
@@ -302,6 +293,18 @@ static int GetTypeSize(int dtype) {
   return -1;
 }
 
+std::shared_ptr<mkldnn::memory> NDArray::GetMKLDNNData(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  if (ptr_->Mkl_mem_)
+    return ptr_->Mkl_mem_;
+  return std::shared_ptr<mkldnn::memory>(new mkldnn::memory(desc,
+        ptr_->shandle.dptr));
+}
+
 std::shared_ptr<mkldnn::memory> NDArray::GetMKLDNNData(
     const mkldnn::memory::primitive_desc &desc,
     std::vector<mkldnn::primitive> &net) const {
@@ -310,7 +313,7 @@ std::shared_ptr<mkldnn::memory> NDArray::GetMKLDNNData(
     return nullptr;
   }
   if (ptr_->storage_type == kDefaultStorage) {
-    ptr_->Mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr));
+    ptr_->SetMKLMem(shape_, dtype_);
   }
   if (ptr_->Mkl_mem_->get_primitive_desc() == desc)
     return ptr_->Mkl_mem_;

From 9c6bf6f27a0c03c2867c19afcf99a231c45a6d8c Mon Sep 17 00:00:00 2001
From: Da zheng
Date: Fri, 3 Nov 2017 01:30:00 +0000
Subject: [PATCH 056/264] Have MKLDNN define the data format.
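
Instead of hard-coding nchw/oihw layouts, memory descriptors are now created
with mkldnn::memory::format::any, and the primitive_desc built from them
decides the actual layout; inputs are then reordered into whatever layout the
primitive chose. A minimal sketch of the idiom, with illustrative shape
values:

    // Let MKLDNN pick the layout for an 8x3x224x224 f32 tensor.
    mkldnn::memory::dims dims{8, 3, 224, 224};
    mkldnn::memory::desc md{dims, mkldnn::memory::data_type::f32,
                            mkldnn::memory::format::any};
    // A primitive_desc built from `md` resolves `any` to a concrete format;
    // callers then query e.g. fwd_pd.src_primitive_desc() and reorder into it.

GetWeights() accordingly returns a pair: the memory in the layout the
primitive expects, plus, when a reorder was needed, the original memory, which
must stay alive until the queued primitives have run.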
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 92 ++++++++++-- src/operator/nn/mkldnn/mkldnn_convolution.cc | 142 ++++++++++--------- 2 files changed, 159 insertions(+), 75 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index a0a5da2a94f2..99431887fa11 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -85,6 +85,86 @@ struct data_type_enum { enum { type = mkldnn::memory::data_type::u8 }; }; +static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { + switch(dtype) { + case mshadow::kFloat32: + return mkldnn::memory::data_type::f32; + default: + return mkldnn::memory::data_type::data_undef; + } +} + +inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { + mkldnn::memory::dims dims(arr.shape().ndim()); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = arr.shape()[i]; + return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()), + mkldnn::memory::format::any}; +} + +inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, + int num_groups = 1) { + if (arr.shape().ndim() == 4 && num_groups == 1) { + return GetMemDesc(arr); + } + else { + mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, + (int) arr.shape()[0] / num_groups, (int) arr.shape()[1], + (int) arr.shape()[2], (int) arr.shape()[3]}; + return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()), + mkldnn::memory::format::any}; + } +} + +typedef std::shared_ptr mkldnn_mem_ptr; +typedef std::shared_ptr mkldnn_mem_const_ptr; + +inline static mkldnn_mem_ptr CreateMKLDNNMem(const mkldnn::memory::primitive_desc &desc) { + // TODO allocate memory more efficiently. + return std::shared_ptr(new mkldnn::memory(desc)); +} + +inline static std::pair GetWeights( + const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, + int num_groups, std::vector &net) { + mkldnn_mem_const_ptr mem; + auto engine = CpuEngine::Instance().get_engine(); + if (arr.shape().ndim() == 2) { + mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], + (int) arr.shape()[1]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } + else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], + (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } + else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, + (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } + else { + LOG(FATAL) << "The weight array has an unsupported number of dimensions"; + return std::pair(nullptr, nullptr); + } + if (mem->get_primitive_desc() == target_pd) + return std::pair(mem, nullptr); + + std::shared_ptr ret = CreateMKLDNNMem(target_pd); + net.push_back(mkldnn::reorder(*mem, *ret)); + return std::pair(ret, 
mem); +} + inline static std::shared_ptr GetWeights(const NDArray &arr, const mkldnn::engine &engine, int num_groups = 1) { if (arr.shape().ndim() == 2) { @@ -94,7 +174,7 @@ inline static std::shared_ptr GetWeights(const NDArray &ar mkldnn::memory::format::oi}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; std::vector net; - return arr.GetMKLDNNData(pd, net); + return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4 && num_groups == 1) { mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], @@ -103,7 +183,7 @@ inline static std::shared_ptr GetWeights(const NDArray &ar mkldnn::memory::format::oihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; std::vector net; - return arr.GetMKLDNNData(pd, net); + return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4) { mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, @@ -112,7 +192,7 @@ inline static std::shared_ptr GetWeights(const NDArray &ar mkldnn::memory::format::goihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; std::vector net; - return arr.GetMKLDNNData(pd, net); + return arr.GetMKLDNNData(pd); } else { LOG(FATAL) << "The weight array has an unsupported number of dimensions"; @@ -120,12 +200,6 @@ inline static std::shared_ptr GetWeights(const NDArray &ar } } -inline static std::shared_ptr CreateMKLDNNMem( - const mkldnn::memory::primitive_desc &desc) { - // TODO allocate memory more efficiently. - return std::shared_ptr(new mkldnn::memory(desc)); -} - } // namespace mxnet #endif #endif // MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 7ac0c3a473bd..d485f098d688 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -32,11 +32,13 @@ namespace mxnet { namespace op { static mkldnn::convolution_forward::primitive_desc GetConvFwd( - const ConvolutionParam& param, bool is_train, - const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, - const mkldnn::memory::desc &out_md, const mkldnn::engine &engine, - std::shared_ptr bias_mem) { + const ConvolutionParam& param, bool is_train, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { auto prop = is_train ? 
mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -47,15 +49,16 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwd( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - if (/*param.dilate.ndim() == 0 &&*/ bias_mem == nullptr) { + if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weights_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } else /*if (param.dilate.ndim() == 0)*/ { + auto bias_md = GetMemDesc(*bias); mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, - strides, padding, padding, mkldnn::padding_kind::zero); + data_md, weight_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } // else { @@ -81,10 +84,12 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwd( } static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( - const ConvolutionParam& param, const mkldnn::memory::desc &data_md, - const mkldnn::memory::desc &weights_md, const mkldnn::memory::desc &out_md, - const mkldnn::engine &engine, - const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + const ConvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -97,7 +102,7 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( } // if (param.dilate.ndim() == 0) { mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weights_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); // } // else { @@ -115,10 +120,13 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( } static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( - const ConvolutionParam& param, const mkldnn::memory::desc &data_md, - const mkldnn::memory::desc &weights_md, const mkldnn::memory::desc &out_md, - const mkldnn::engine &engine, const mkldnn::convolution_forward::primitive_desc &fwd_pd, - std::shared_ptr bias_mem) { + const ConvolutionParam& param, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); 
mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -129,15 +137,16 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - if (/*param.dilate.ndim() == 0 &&*/ bias_mem == nullptr) { + if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weights_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } else /*if (param.dilate.ndim() == 0)*/ { + auto bias_md = GetMemDesc(*bias); mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, - strides, padding, padding, mkldnn::padding_kind::zero); + data_md, weight_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } // else { @@ -166,31 +175,27 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct const std::vector &in_data, const std::vector &req, const std::vector &out_data) { const ConvolutionParam& param = nnvm::get(attrs.parsed); - auto data_mem = in_data[conv::kData].GetMKLDNNData(); - auto data_desc = data_mem->get_primitive_desc().desc(); - auto cpu_engine = data_mem->get_primitive_desc().get_engine(); - auto weight_mem = GetWeights(in_data[conv::kWeight], cpu_engine, param.num_group); - auto weight_desc = weight_mem->get_primitive_desc().desc(); - auto out_mem = const_cast(out_data[conv::kOut]).GetMKLDNNData(); - auto out_desc = out_mem->get_primitive_desc().desc(); - + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, + ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], + param.no_bias ? 
nullptr : &in_data[conv::kBias], out_data[conv::kOut]); std::vector net; + printf("src layout: %d\n", fwd_pd.src_primitive_desc().desc().data.format); + printf("weight layout: %d\n", fwd_pd.weights_primitive_desc().desc().data.format); + printf("out layout: %d\n", fwd_pd.dst_primitive_desc().desc().data.format); + auto data_mem = in_data[conv::kData].GetMKLDNNData(fwd_pd.src_primitive_desc(), net); + auto engine = CpuEngine::Instance().get_engine(); + auto weight_data = GetWeights(in_data[conv::kWeight], + fwd_pd.weights_primitive_desc(), param.num_group, net); + auto weight_mem = weight_data.first; + + auto out_mem = const_cast(out_data[conv::kOut]).CreateMKLDNNData( + fwd_pd.dst_primitive_desc()); + if (param.no_bias) { - mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, - ctx.is_train, data_desc, weight_desc, out_desc, cpu_engine, nullptr); - CHECK(fwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); - CHECK(fwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - CHECK(fwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); net.push_back(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, *out_mem)); } else { - auto bias_mem = in_data[conv::kBias].GetMKLDNNData(); - mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, - ctx.is_train, data_desc, weight_desc, out_desc, cpu_engine, bias_mem); - CHECK(fwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); - CHECK(fwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - CHECK(fwd_pd.bias_primitive_desc() == bias_mem->get_primitive_desc()); - CHECK(fwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); + auto bias_mem = in_data[conv::kBias].GetMKLDNNData(fwd_pd.bias_primitive_desc(), net); net.push_back(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, *bias_mem, *out_mem)); } @@ -201,30 +206,25 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c const std::vector& inputs, const std::vector& req, const std::vector& outputs) { const std::vector &in_grad = outputs; + auto engine = CpuEngine::Instance().get_engine(); const ConvolutionParam& param = nnvm::get(attrs.parsed); - auto out_grad_mem = inputs[conv::kOut].GetMKLDNNData(); - auto out_grad_desc = out_grad_mem->get_primitive_desc().desc(); - auto data_mem = inputs[conv::kData + 1].GetMKLDNNData(); - auto data_desc = data_mem->get_primitive_desc().desc(); - auto cpu_engine = data_mem->get_primitive_desc().get_engine(); - auto weight_mem = GetWeights(inputs[conv::kWeight + 1], cpu_engine, - param.num_group); - auto weight_desc = weight_mem->get_primitive_desc().desc(); - std::shared_ptr in_grad_bias; - if (!param.no_bias) - in_grad_bias = const_cast(in_grad[conv::kBias]).GetMKLDNNData(); mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, - data_desc, weight_desc, out_grad_desc, cpu_engine, in_grad_bias); + inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; std::vector net; - std::shared_ptr in_grad_mem, in_grad_weight; + std::shared_ptr in_grad_mem, in_grad_weight, in_grad_bias; + std::pair weight_data; if (req[conv::kData]) { mkldnn::convolution_backward_data::primitive_desc bwdData_pd - = GetConvBwdData(param, data_desc, weight_desc, out_grad_desc, cpu_engine, fwd_pd); - CHECK(bwdData_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); - CHECK(bwdData_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - + = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + inputs[conv::kOut], fwd_pd); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNData( + bwdData_pd.diff_dst_primitive_desc(), net); + weight_data = GetWeights(inputs[conv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group, net); + auto weight_mem = weight_data.first; in_grad_mem = const_cast(in_grad[conv::kData]).CreateMKLDNNData( bwdData_pd.diff_src_primitive_desc()); bool copy_back = false; @@ -239,27 +239,37 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c } if (req[conv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd - = GetConvBwdWeights(param, data_desc, weight_desc, out_grad_desc, - cpu_engine, fwd_pd, in_grad_bias); - CHECK(bwdWeights_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); - CHECK(bwdWeights_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut], fwd_pd); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNData( + bwdWeights_pd.diff_dst_primitive_desc(), net); + auto data_mem = inputs[conv::kData + 1].GetMKLDNNData( + bwdWeights_pd.src_primitive_desc(), net); in_grad_weight = const_cast(in_grad[conv::kWeight]).CreateMKLDNNData( bwdWeights_pd.diff_weights_primitive_desc()); - bool copy_back = false; + bool copy_back_weight = false; + bool copy_back_bias = false; if (in_grad_weight == nullptr) { in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); - copy_back = true; + copy_back_weight = true; } if (param.no_bias) { net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); } else { + in_grad_bias = const_cast(in_grad[conv::kBias]).CreateMKLDNNData( + bwdWeights_pd.diff_bias_primitive_desc()); + if (in_grad_bias == nullptr) { + in_grad_bias = CreateMKLDNNMem(bwdWeights_pd.diff_bias_primitive_desc()); + copy_back_bias = true; + } net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); } - if (copy_back) { + if (copy_back_weight) const_cast(in_grad[conv::kWeight]).CopyFrom(*in_grad_weight, net); - } + if (copy_back_bias) + const_cast(in_grad[conv::kBias]).CopyFrom(*in_grad_bias, net); } mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } From 3bdc8717c8faf0b3369cad7c773bb80902d32dd4 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Fri, 3 Nov 2017 19:31:34 +0000 Subject: [PATCH 057/264] Create output MKLDNN memory explicitly for FC. 
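
The pattern introduced here, mirrored from the convolution backward pass: ask
the destination NDArray for memory matching the primitive's primitive_desc;
if it cannot provide one, compute into a temporary buffer and queue a reorder
back. A condensed sketch, with `dst`, `pd` and `net` as illustrative
placeholders:

    auto mem = const_cast<NDArray &>(dst).CreateMKLDNNData(pd);
    bool copy_back = false;
    if (mem == nullptr) {           // dst cannot take this layout directly
      mem = CreateMKLDNNMem(pd);    // temporary buffer with the right pd
      copy_back = true;
    }
    // ... push the primitive that writes into *mem onto `net` ...
    if (copy_back)
      const_cast<NDArray &>(dst).CopyFrom(*mem, net);  // reorder back into dst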
--- .../nn/mkldnn/mkldnn_fully_connected.cc | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 49419f7c1fc3..2b9d217c4fa8 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -73,7 +73,6 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto data_mem = in_data[fullc::kData].GetMKLDNNData(); auto data_desc = data_mem->get_primitive_desc().desc(); auto cpu_engine = data_mem->get_primitive_desc().get_engine(); - CHECK_EQ(in_data[fullc::kWeight + 1].shape().ndim(), 2); auto weight_mem = GetWeights(in_data[fullc::kWeight], cpu_engine); auto weight_desc = weight_mem->get_primitive_desc().desc(); auto out_mem = const_cast(out_data[fullc::kOut]).GetMKLDNNData(); @@ -112,7 +111,6 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto data_mem = inputs[fullc::kData + 1].GetMKLDNNData(); auto data_desc = data_mem->get_primitive_desc().desc(); auto cpu_engine = data_mem->get_primitive_desc().get_engine(); - CHECK_EQ(inputs[fullc::kWeight + 1].shape().ndim(), 2); auto weight_mem = GetWeights(inputs[fullc::kWeight + 1], cpu_engine); auto weight_desc = weight_mem->get_primitive_desc().desc(); std::shared_ptr in_grad_bias; @@ -123,6 +121,7 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; std::vector net; + mkldnn_mem_ptr in_grad_mem, in_grad_weight; if (req[fullc::kData]) { mkldnn::inner_product_backward_data::desc ipBwdData_desc(data_desc, weight_desc, out_grad_desc); @@ -130,18 +129,30 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, cpu_engine, ipFwd_pd); CHECK(ipBwdData_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); CHECK(ipBwdData_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - auto in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( + in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( ipBwdData_pd.diff_src_primitive_desc()); + bool copy_back = false; + if (in_grad_mem == nullptr) { + in_grad_mem = CreateMKLDNNMem(ipBwdData_pd.diff_src_primitive_desc()); + copy_back = true; + } net.push_back(mkldnn::inner_product_backward_data(ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem)); + if (copy_back) + const_cast(in_grad[fullc::kData]).CopyFrom(*in_grad_mem, net); } if (req[fullc::kWeight]) { mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetIPBwd( data_desc, weight_desc, out_grad_desc, cpu_engine, ipFwd_pd, in_grad_bias); CHECK(ipBwdWeights_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); CHECK(ipBwdWeights_pd.src_primitive_desc() == data_mem->get_primitive_desc()); - auto in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( + in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( ipBwdWeights_pd.diff_weights_primitive_desc()); + bool copy_back_weight = false; + if (in_grad_weight == nullptr) { + in_grad_weight = CreateMKLDNNMem(ipBwdWeights_pd.diff_weights_primitive_desc()); + copy_back_weight = true; + } if (param.no_bias) { net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); @@ -149,6 +160,8 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, 
net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); } + if (copy_back_weight) + const_cast(in_grad[fullc::kWeight]).CopyFrom(*in_grad_weight, net); } mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } From 551c66d9348cb577ed88767970961394e38bab4f Mon Sep 17 00:00:00 2001 From: Da zheng Date: Fri, 3 Nov 2017 23:44:15 +0000 Subject: [PATCH 058/264] Fix a bug in NDArray. --- src/ndarray/ndarray.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 3ef4e5d74976..4f73a2ef02f9 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -257,9 +257,20 @@ void NDArray::set_fresh_out_grad(bool state) const { } #if MXNET_USE_MKLDNN == 1 +static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) { + if (shape.ndim() != ndims) + return false; + for (int i = 0; i < ndims; i++) + if (shape[i] != dims[i]) + return false; + return true; +} + void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { - if (Mkl_mem_) + if (Mkl_mem_ && same_shape(shape, Mkl_mem_->get_primitive_desc().desc().data.dims, + Mkl_mem_->get_primitive_desc().desc().data.ndims)) { return; + } mkldnn::memory::dims dims(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) @@ -299,8 +310,10 @@ std::shared_ptr NDArray::GetMKLDNNData( LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; } - if (ptr_->Mkl_mem_) + if (ptr_->Mkl_mem_) { + CHECK(ptr_->Mkl_mem_->get_primitive_desc() == desc); return ptr_->Mkl_mem_; + } return std::shared_ptr(new mkldnn::memory(desc, ptr_->shandle.dptr)); } From b6abf35be01f65373c4cbd6a3191f58f6177af2b Mon Sep 17 00:00:00 2001 From: Da zheng Date: Fri, 3 Nov 2017 23:45:14 +0000 Subject: [PATCH 059/264] Fix a bug in GetWeightDesc. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 99431887fa11..dd1475cec9c0 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -104,7 +104,7 @@ inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, int num_groups = 1) { - if (arr.shape().ndim() == 4 && num_groups == 1) { + if (num_groups == 1) { return GetMemDesc(arr); } else { From dff9c0ff7c52545c0fc1eb3fb5acdb477b8ac42e Mon Sep 17 00:00:00 2001 From: Da zheng Date: Fri, 3 Nov 2017 23:46:29 +0000 Subject: [PATCH 060/264] Convert data layout if necessary in FC. 
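
Rather than assuming the inputs already sit in the layout the inner-product
primitive wants, the forward and backward passes now fetch them through
GetMKLDNNData(pd, net), which queues a reorder into a matching buffer whenever
the stored layout differs from pd. Roughly, for the forward data input:

    std::vector<mkldnn::primitive> net;
    // Returns memory matching the primitive's expected input layout; if the
    // array is stored differently, a reorder is appended to `net` first.
    auto data_mem = in_data[fullc::kData].GetMKLDNNData(
        ipFwd_pd.src_primitive_desc(), net);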
--- .../nn/mkldnn/mkldnn_fully_connected.cc | 138 ++++++++++-------- 1 file changed, 78 insertions(+), 60 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 2b9d217c4fa8..3d3ef4689835 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -31,36 +31,53 @@ namespace mxnet { namespace op { inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( - const mkldnn::memory::desc &data_desc, const mkldnn::memory::desc &weight_desc, - const mkldnn::memory::desc &out_desc, const mkldnn::engine &engine, - std::shared_ptr bias_mem) { - if (bias_mem) { - auto bias_desc = bias_mem->get_primitive_desc().desc(); + const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, - data_desc, weight_desc, bias_desc, out_desc); + data_md, weight_md, bias_md, out_md); return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); } else { mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, - data_desc, weight_desc, out_desc); + data_md, weight_md, out_md); return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); } } -inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwd( - const mkldnn::memory::desc &data_desc, const mkldnn::memory::desc &weight_desc, - const mkldnn::memory::desc &out_desc, const mkldnn::engine &engine, - mkldnn::inner_product_forward::primitive_desc ipFwd_pd, - std::shared_ptr bias_mem) { - if (bias_mem) { - mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_desc, - weight_desc, bias_mem->get_primitive_desc().desc(), out_desc); +inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData( + const NDArray &data, const NDArray &weight, const NDArray &output, + mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); + return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd); +} + +inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights( + const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, + weight_md, bias_md, out_md); return mkldnn::inner_product_backward_weights::primitive_desc( ipBwdWeights_desc, engine, ipFwd_pd); } else { - mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_desc, - weight_desc, out_desc); + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, + weight_md, out_md); return mkldnn::inner_product_backward_weights::primitive_desc( ipBwdWeights_desc, engine, ipFwd_pd); } @@ 
-70,34 +87,30 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); - auto data_mem = in_data[fullc::kData].GetMKLDNNData(); - auto data_desc = data_mem->get_primitive_desc().desc(); - auto cpu_engine = data_mem->get_primitive_desc().get_engine(); - auto weight_mem = GetWeights(in_data[fullc::kWeight], cpu_engine); - auto weight_desc = weight_mem->get_primitive_desc().desc(); - auto out_mem = const_cast(out_data[fullc::kOut]).GetMKLDNNData(); - auto out_desc = out_mem->get_primitive_desc().desc(); - std::vector net; + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( + in_data[fullc::kData], in_data[fullc::kWeight], + param.no_bias ? nullptr : &in_data[fullc::kBias], out_data[fullc::kOut]); + auto data_mem = in_data[fullc::kData].GetMKLDNNData(ipFwd_pd.src_primitive_desc(), net); + auto weight_mem = in_data[fullc::kWeight].GetMKLDNNData( + ipFwd_pd.weights_primitive_desc(), net); + auto out_mem = const_cast(out_data[fullc::kOut]).CreateMKLDNNData( + ipFwd_pd.dst_primitive_desc()); + bool copy_back = false; + if (out_mem == nullptr) { + out_mem = CreateMKLDNNMem(ipFwd_pd.dst_primitive_desc()); + copy_back = true; + } if (param.no_bias) { - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( - data_desc, weight_desc, out_desc, cpu_engine, nullptr); - CHECK(ipFwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); - CHECK(ipFwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - CHECK(ipFwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); net.push_back(mkldnn::inner_product_forward(ipFwd_pd, *data_mem, *weight_mem, *out_mem)); } else { - auto bias_mem = in_data[fullc::kBias].GetMKLDNNData(); - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( - data_desc, weight_desc, out_desc, cpu_engine, bias_mem); - CHECK(ipFwd_pd.src_primitive_desc() == data_mem->get_primitive_desc()); - CHECK(ipFwd_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); - CHECK(ipFwd_pd.bias_primitive_desc() == bias_mem->get_primitive_desc()); - CHECK(ipFwd_pd.dst_primitive_desc() == out_mem->get_primitive_desc()); + auto bias_mem = in_data[fullc::kBias].GetMKLDNNData(ipFwd_pd.bias_primitive_desc(), net); net.push_back(mkldnn::inner_product_forward(ipFwd_pd, *data_mem, *weight_mem, *bias_mem, *out_mem)); } + if (copy_back) + const_cast(out_data[fullc::kOut]).CopyFrom(*out_mem, net); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } @@ -106,29 +119,21 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &outputs) { const std::vector &in_grad = outputs; const FullyConnectedParam& param = nnvm::get(attrs.parsed); - auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNData(); - auto out_grad_desc = out_grad_mem->get_primitive_desc().desc(); - auto data_mem = inputs[fullc::kData + 1].GetMKLDNNData(); - auto data_desc = data_mem->get_primitive_desc().desc(); - auto cpu_engine = data_mem->get_primitive_desc().get_engine(); - auto weight_mem = GetWeights(inputs[fullc::kWeight + 1], cpu_engine); - auto weight_desc = weight_mem->get_primitive_desc().desc(); - std::shared_ptr in_grad_bias; - if (!param.no_bias) - in_grad_bias = const_cast(in_grad[fullc::kBias]).GetMKLDNNData(); - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data_desc, - weight_desc, out_grad_desc, cpu_engine, in_grad_bias); + 
mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( + inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], + param.no_bias ? nullptr : &in_grad[fullc::kBias], inputs[fullc::kOut]); CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; std::vector net; - mkldnn_mem_ptr in_grad_mem, in_grad_weight; + mkldnn_mem_ptr in_grad_mem, in_grad_weight, in_grad_bias; if (req[fullc::kData]) { - mkldnn::inner_product_backward_data::desc ipBwdData_desc(data_desc, weight_desc, - out_grad_desc); - mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd(ipBwdData_desc, - cpu_engine, ipFwd_pd); - CHECK(ipBwdData_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); - CHECK(ipBwdData_pd.weights_primitive_desc() == weight_mem->get_primitive_desc()); + mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData( + inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], inputs[fullc::kOut], + ipFwd_pd); + auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNData( + ipBwdData_pd.diff_dst_primitive_desc(), net); + auto weight_mem = inputs[fullc::kWeight + 1].GetMKLDNNData( + ipBwdData_pd.weights_primitive_desc(), net); in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( ipBwdData_pd.diff_src_primitive_desc()); bool copy_back = false; @@ -142,13 +147,18 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const_cast(in_grad[fullc::kData]).CopyFrom(*in_grad_mem, net); } if (req[fullc::kWeight]) { - mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetIPBwd( - data_desc, weight_desc, out_grad_desc, cpu_engine, ipFwd_pd, in_grad_bias); - CHECK(ipBwdWeights_pd.diff_dst_primitive_desc() == out_grad_mem->get_primitive_desc()); - CHECK(ipBwdWeights_pd.src_primitive_desc() == data_mem->get_primitive_desc()); + mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd + = GetIPBwdWeights(inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], + param.no_bias ? 
nullptr : &in_grad[fullc::kBias], inputs[fullc::kOut], + ipFwd_pd); + auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNData( + ipBwdWeights_pd.diff_dst_primitive_desc(), net); + auto data_mem = inputs[fullc::kData + 1].GetMKLDNNData( + ipBwdWeights_pd.src_primitive_desc(), net); in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( ipBwdWeights_pd.diff_weights_primitive_desc()); bool copy_back_weight = false; + bool copy_back_bias = false; if (in_grad_weight == nullptr) { in_grad_weight = CreateMKLDNNMem(ipBwdWeights_pd.diff_weights_primitive_desc()); copy_back_weight = true; @@ -157,11 +167,19 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); } else { + in_grad_bias = const_cast(in_grad[fullc::kBias]).CreateMKLDNNData( + ipBwdWeights_pd.diff_bias_primitive_desc()); + if (in_grad_bias == nullptr) { + in_grad_bias = CreateMKLDNNMem(ipBwdWeights_pd.diff_bias_primitive_desc()); + copy_back_bias = true; + } net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); } if (copy_back_weight) const_cast(in_grad[fullc::kWeight]).CopyFrom(*in_grad_weight, net); + if (copy_back_bias) + const_cast(in_grad[fullc::kBias]).CopyFrom(*in_grad_bias, net); } mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } From 310be84e1669a4e013c842728da090e166e4831b Mon Sep 17 00:00:00 2001 From: Da zheng Date: Fri, 3 Nov 2017 23:47:17 +0000 Subject: [PATCH 061/264] remove unnecessary print in MKLDNN convolution. --- src/operator/nn/mkldnn/mkldnn_convolution.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index d485f098d688..55f8bbeed35d 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -179,9 +179,6 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); std::vector net; - printf("src layout: %d\n", fwd_pd.src_primitive_desc().desc().data.format); - printf("weight layout: %d\n", fwd_pd.weights_primitive_desc().desc().data.format); - printf("out layout: %d\n", fwd_pd.dst_primitive_desc().desc().data.format); auto data_mem = in_data[conv::kData].GetMKLDNNData(fwd_pd.src_primitive_desc(), net); auto engine = CpuEngine::Instance().get_engine(); auto weight_data = GetWeights(in_data[conv::kWeight], From b1f17c5c20bb9bdb21c86ed092033e36f91c934a Mon Sep 17 00:00:00 2001 From: Da zheng Date: Thu, 2 Nov 2017 20:02:23 +0000 Subject: [PATCH 062/264] Add MKLDNN deconvolution. 
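
Deconvolution is expressed here as transposed convolution: the forward pass
runs convolution_backward_data (hinted by a forward convolution
primitive_desc with the data and output roles swapped), and the backward-data
pass runs a convolution forward. As a sanity check on the shapes (standard
transposed-convolution arithmetic, not code from this patch): with stride s,
kernel k and pad p in one spatial dimension, an input of size H yields an
output of size

    s * (H - 1) + k - 2 * p

which is exactly the input size that a normal convolution with the same
s, k, p reduces that output back to.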
--- src/operator/nn/deconvolution.cc | 94 +++++- .../nn/mkldnn/mkldnn_deconvolution.cc | 283 ++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_ops-inl.h | 8 + 3 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_deconvolution.cc diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 3dd3f9f013a0..13642e643342 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -25,6 +25,7 @@ */ #include "./deconvolution-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { @@ -254,6 +255,93 @@ static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, return true; } +inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 3 : 4; + uint32_t out_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), out_expected); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +static void DeconvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNDeconvolution_Forward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + DeconvolutionCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static void DeconvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + MKLDNNDeconvolution_Backward(attrs, ctx, inputs, req, outputs); + return; + } +#endif + // TODO I need to convert format. 
+ std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + DeconvolutionGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) { using namespace mshadow; DeconvolutionParam param_; @@ -312,10 +400,12 @@ NNVM_REGISTER_OP(Deconvolution) }) .set_attr("FInferShape", DeconvolutionShape) .set_attr("FInferType", DeconvolutionType) +.set_attr("FInferStorageType", DeconvStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("FCompute", DeconvolutionCompute) +.set_attr("FComputeEx", DeconvolutionCompute_CPU) .set_attr("FGradient", DeconvolutionGrad{"_backward_Deconvolution"}) .add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") .add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") @@ -329,11 +419,13 @@ NNVM_REGISTER_OP(_backward_Deconvolution) return params.no_bias ? 2 : 3; }) .set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_DeconvStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) .set_attr_parser(DeconvolutionParamParser) -.set_attr("FCompute", DeconvolutionGradCompute); +.set_attr("FCompute", DeconvolutionGradCompute) +.set_attr("FComputeEx", DeconvolutionGradCompute_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc new file mode 100644 index 000000000000..31c91f4c7373 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_deconvolution.cc + * \brief + * \author Da Zheng +*/ + +#include "../deconvolution-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_( + const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, + const mkldnn::memory::desc *bias_md, const mkldnn::memory::desc &out_md, + const mkldnn::engine &engine, const mkldnn::memory::dims &strides, + const mkldnn::memory::dims &padding) { + // TODO when dilate > 1 + if (bias_md == nullptr) { + mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides, + padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else { + mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::convolution_direct, out_md, weights_md, + *bias_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } +} + +static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwd( + const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (bias) { + auto bias_md = GetMemDesc(*bias); + auto bwd_pd = GetDeconvBwd_(data_md, weight_md, &bias_md, + out_md, engine, strides, padding); + // TODO when dilate > 1 + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); + } + else { + auto bwd_pd = GetDeconvBwd_(data_md, weight_md, nullptr, out_md, engine, + strides, padding); + // TODO when dilate > 1 + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); + } +} + +static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( + const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + // TODO dilate + if (bias) { + auto bias_md = GetMemDesc(*bias); + return GetDeconvBwd_(data_md, weight_md, &bias_md, out_md, + engine, strides, padding); + } + else + return GetDeconvBwd_(data_md, weight_md, nullptr, 
out_md, + engine, strides, padding); +} + +static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( + const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Instance().get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else /*if (param.dilate.ndim() == 0)*/ { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, bias_md, data_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } +// else { +// // TODO I should test the case with dilate. +// mkldnn::memory::dims dilates{0, 0}; +// if (param.dilate.ndim() == 2) { +// dilates[0] = param.dilate[0]; +// dilates[1] = param.dilate[1]; +// } +// if (bias_mem == nullptr) { +// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, out_md, strides, dilates, padding, padding, +// mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); +// } +// else { +// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, +// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, +// strides, dilates, padding, padding, mkldnn::padding_kind::zero); +// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); +// } +// } +} + +void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + + std::vector net; + mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd( + param, in_data[deconv::kData], in_data[deconv::kWeight], + param.no_bias ? 
nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); + auto data_mem = in_data[deconv::kData].GetMKLDNNData( + deconvFwd_pd.diff_src_primitive_desc(), net); + auto weight_data = GetWeights(in_data[deconv::kWeight], + deconvFwd_pd.weights_primitive_desc(), param.num_group, net); + auto weight_mem = weight_data.first; + auto out_mem = const_cast(out_data[deconv::kOut]).CreateMKLDNNData( + deconvFwd_pd.diff_dst_primitive_desc()); + bool copy_back = false; + if (out_mem == nullptr) { + out_mem = CreateMKLDNNMem(deconvFwd_pd.diff_dst_primitive_desc()); + copy_back = true; + } + + net.push_back(mkldnn::convolution_backward_data(deconvFwd_pd, *data_mem, *weight_mem, + *out_mem)); + if (copy_back) + const_cast(out_data[deconv::kOut]).CopyFrom(*out_mem, net); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + if (!param.no_bias) { + // add bias, broadcast bias to dim 1: channel + // TODO this is problematic if the layout isn't expected. + // we need to handle the type correctly. + typedef float DType; + Stream *s = ctx.get_stream(); + Tensor bias = in_data[deconv::kBias].data().get(s); + Tensor out_cpu = out_data[deconv::kOut].data().get(s); + out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_); + } +} + +void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs) { + const std::vector &in_grad = outputs; + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + std::vector net; + mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( + param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], nullptr, + inputs[deconv::kOut]); + std::shared_ptr in_grad_mem, in_grad_weight, in_grad_bias; + std::pair weight_data; + if (req[deconv::kData]) { + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNData( + bwdData_pd.src_primitive_desc(), net); + weight_data = GetWeights(inputs[deconv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group, net); + auto weight_mem = weight_data.first; + in_grad_mem = const_cast(in_grad[deconv::kData]).CreateMKLDNNData( + bwdData_pd.dst_primitive_desc()); + bool copy_back = false; + if (in_grad_mem == nullptr) { + in_grad_mem = CreateMKLDNNMem(bwdData_pd.dst_primitive_desc()); + copy_back = true; + } + net.push_back(mkldnn::convolution_forward(bwdData_pd, *out_grad_mem, + *weight_mem, *in_grad_mem)); + if (copy_back) + const_cast(in_grad[deconv::kData]).CopyFrom(*in_grad_mem, net); + } + if (req[deconv::kWeight]) { + mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd + = GetDeconvBwdWeights(param, inputs[deconv::kData + 1], + inputs[deconv::kWeight + 1], + param.no_bias ? 
nullptr : &inputs[deconv::kWeight + 1], + inputs[deconv::kOut], bwdData_pd); + CHECK_NE(req[deconv::kWeight], kAddTo); + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNData( + bwdWeights_pd.diff_dst_primitive_desc(), net); + auto data_mem = inputs[deconv::kData + 1].GetMKLDNNData( + bwdWeights_pd.src_primitive_desc(), net); + in_grad_weight = const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData( + bwdWeights_pd.diff_weights_primitive_desc()); + bool copy_back_weight = false; + bool copy_back_bias = false; + if (in_grad_weight == nullptr) { + in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); + copy_back_weight = true; + } + if (param.no_bias) { + net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, + *out_grad_mem, *data_mem, *in_grad_weight)); + } else { + in_grad_bias = const_cast(in_grad[deconv::kBias]).CreateMKLDNNData( + bwdWeights_pd.diff_bias_primitive_desc()); + if (in_grad_bias == nullptr) { + in_grad_bias = CreateMKLDNNMem(bwdWeights_pd.diff_bias_primitive_desc()); + copy_back_bias = true; + } + net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, + *out_grad_mem, *data_mem, *in_grad_weight, *in_grad_bias)); + } + if (copy_back_weight) + const_cast(in_grad[deconv::kWeight]).CopyFrom(*in_grad_weight, net); + if (copy_back_bias) + const_cast(in_grad[deconv::kBias]).CopyFrom(*in_grad_bias, net); + } + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); +} + +} +} + +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index e2c8b986e407..710e439515f8 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -55,6 +55,14 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c const std::vector& inputs, const std::vector& req, const std::vector& outputs); +/* For deconvolution */ +void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data); +void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs); + } } #endif // MXNET_USE_MKLDNN == 1 From 926289ccffe636cdf87cc61b38da9042be7b01d0 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Mon, 6 Nov 2017 18:06:51 +0000 Subject: [PATCH 063/264] Add MKLDNNStream to manage primitives and memories. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 47 ++++++++++++++++++------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index dd1475cec9c0..733980ef54e8 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -119,14 +119,40 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, typedef std::shared_ptr mkldnn_mem_ptr; typedef std::shared_ptr mkldnn_mem_const_ptr; +class MKLDNNStream { + std::vector net; + // Here we hold all memory related to the operators in the stream. 
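+  // Holding shared pointers here keeps temporary memories alive until
+  // Submit() has executed the queued primitives; Submit() then clears
+  // both vectors so the thread-local stream can be reused.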
+ std::vector mem_holder; +public: + static MKLDNNStream &Instance() { + static thread_local MKLDNNStream stream; + return stream; + } + + void RegisterPrim(const mkldnn::primitive &prim) { + net.push_back(prim); + } + + void RegisterMem(mkldnn_mem_const_ptr mem) { + mem_holder.push_back(mem); + } + + void Submit() { + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + net.clear(); + mem_holder.clear(); + } +}; + inline static mkldnn_mem_ptr CreateMKLDNNMem(const mkldnn::memory::primitive_desc &desc) { // TODO allocate memory more efficiently. - return std::shared_ptr(new mkldnn::memory(desc)); + std::shared_ptr ret(new mkldnn::memory(desc)); + MKLDNNStream::Instance().RegisterMem(ret); + return ret; } -inline static std::pair GetWeights( - const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, - int num_groups, std::vector &net) { +inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, int num_groups) { mkldnn_mem_const_ptr mem; auto engine = CpuEngine::Instance().get_engine(); if (arr.shape().ndim() == 2) { @@ -155,17 +181,17 @@ inline static std::pair GetWeights( } else { LOG(FATAL) << "The weight array has an unsupported number of dimensions"; - return std::pair(nullptr, nullptr); + return nullptr; } if (mem->get_primitive_desc() == target_pd) - return std::pair(mem, nullptr); + return mem; std::shared_ptr ret = CreateMKLDNNMem(target_pd); - net.push_back(mkldnn::reorder(*mem, *ret)); - return std::pair(ret, mem); + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; } -inline static std::shared_ptr GetWeights(const NDArray &arr, +inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, const mkldnn::engine &engine, int num_groups = 1) { if (arr.shape().ndim() == 2) { mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], @@ -173,7 +199,6 @@ inline static std::shared_ptr GetWeights(const NDArray &ar mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::oi}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; - std::vector net; return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4 && num_groups == 1) { @@ -182,7 +207,6 @@ inline static std::shared_ptr GetWeights(const NDArray &ar mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::oihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; - std::vector net; return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4) { @@ -191,7 +215,6 @@ inline static std::shared_ptr GetWeights(const NDArray &ar mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::goihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; - std::vector net; return arr.GetMKLDNNData(pd); } else { From fae1fc3e18f2526e724de21c81ba80e484d26379 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Mon, 6 Nov 2017 18:08:19 +0000 Subject: [PATCH 064/264] Use MKLDNNStream to register memory in NDArray. 
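This replaces the variant of GetMKLDNNData() that took a primitive list with
GetMKLDNNDataReorder(), which queues any needed reorder on the thread-local
MKLDNNStream. A before/after sketch of the calling convention (illustrative
only; `arr` and `desc` stand for any NDArray and target primitive
descriptor):

    // Before: the caller owned the primitive list and submitted it.
    std::vector<mkldnn::primitive> net;
    auto mem = arr.GetMKLDNNData(desc, net);
    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();

    // After: the reorder is registered on the thread-local stream and
    // executed, together with the operator's primitives, on Submit().
    auto mem = arr.GetMKLDNNDataReorder(desc);
    MKLDNNStream::Instance().Submit();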
--- include/mxnet/ndarray.h | 7 +++---- src/ndarray/ndarray.cc | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 5cf9f50256af..8d51858f774f 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -566,11 +566,10 @@ class NDArray { * The returned mkldnn::memory will have the same physical layout as * the given primitive_desc. */ - std::shared_ptr GetMKLDNNData( - const mkldnn::memory::primitive_desc &desc, - std::vector &net) const; + std::shared_ptr GetMKLDNNDataReorder( + const mkldnn::memory::primitive_desc &desc) const; - void CopyFrom(const mkldnn::memory &mem, std::vector &net); + void CopyFrom(const mkldnn::memory &mem); std::shared_ptr CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); #endif diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 4f73a2ef02f9..369c96a6ce19 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -314,13 +314,13 @@ std::shared_ptr NDArray::GetMKLDNNData( CHECK(ptr_->Mkl_mem_->get_primitive_desc() == desc); return ptr_->Mkl_mem_; } - return std::shared_ptr(new mkldnn::memory(desc, - ptr_->shandle.dptr)); + mkldnn_mem_const_ptr ret(new mkldnn::memory(desc, ptr_->shandle.dptr)); + MKLDNNStream::Instance().RegisterMem(ret); + return ret; } -std::shared_ptr NDArray::GetMKLDNNData( - const mkldnn::memory::primitive_desc &desc, - std::vector &net) const { +std::shared_ptr NDArray::GetMKLDNNDataReorder( + const mkldnn::memory::primitive_desc &desc) const { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; @@ -332,8 +332,10 @@ std::shared_ptr NDArray::GetMKLDNNData( return ptr_->Mkl_mem_; else { // TODO we should manage the memory allocation here. - std::shared_ptr ret(new mkldnn::memory(desc)); - net.push_back(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); + mkldnn_mem_ptr ret(new mkldnn::memory(desc)); + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterMem(ret); + stream.RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); return ret; } } @@ -347,14 +349,13 @@ std::shared_ptr NDArray::GetMKLDNNData() const { return nullptr; } -void NDArray::CopyFrom(const mkldnn::memory &mem, - std::vector &net) { +void NDArray::CopyFrom(const mkldnn::memory &mem) { if (ptr_ == nullptr) { LOG(FATAL) << "The NDArray hasn't been initialized"; return; } ptr_->SetMKLMem(shape_, dtype_); - net.push_back(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); } std::shared_ptr NDArray::CreateMKLDNNData( @@ -370,8 +371,7 @@ std::shared_ptr NDArray::CreateMKLDNNData( if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) return ptr_->Mkl_mem_; - // TODO we should manage the memory allocation here. - ptr_->Mkl_mem_.reset(new mkldnn::memory(desc)); + ptr_->Mkl_mem_ = CreateMKLDNNMem(desc); return ptr_->Mkl_mem_; } #endif From 474f847ac839077020fdf22965d323f7e6c728ae Mon Sep 17 00:00:00 2001 From: Da zheng Date: Mon, 6 Nov 2017 18:09:45 +0000 Subject: [PATCH 065/264] Use MKLDNNStream to manage resources in operators. 
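Every converted operator now follows the same pattern: fetch inputs with
GetMKLDNNDataReorder() so layout conversions are queued rather than run
eagerly, register the compute primitive, and submit the whole chain once.
A condensed sketch of the forward-pass shape, taken from the convolution
changes below (bias and the copy-back fallback elided, and it assumes the
output array accepts the primitive's layout):

    auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(
        fwd_pd.src_primitive_desc());
    auto weight_mem = GetWeights(in_data[conv::kWeight],
        fwd_pd.weights_primitive_desc(), param.num_group);
    auto out_mem = const_cast<NDArray &>(out_data[conv::kOut])
        .CreateMKLDNNData(fwd_pd.dst_primitive_desc());
    MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(
        fwd_pd, *data_mem, *weight_mem, *out_mem));
    MKLDNNStream::Instance().Submit();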
--- src/operator/nn/mkldnn/mkldnn_convolution.cc | 65 +++++++++---------- .../nn/mkldnn/mkldnn_deconvolution.cc | 63 +++++++++--------- .../nn/mkldnn/mkldnn_fully_connected.cc | 64 +++++++++--------- src/operator/nn/mkldnn/mkldnn_relu-inl.h | 12 ++-- 4 files changed, 96 insertions(+), 108 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 55f8bbeed35d..28ee1874d6d8 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -178,25 +178,23 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); - std::vector net; - auto data_mem = in_data[conv::kData].GetMKLDNNData(fwd_pd.src_primitive_desc(), net); + auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd_pd.src_primitive_desc()); auto engine = CpuEngine::Instance().get_engine(); - auto weight_data = GetWeights(in_data[conv::kWeight], - fwd_pd.weights_primitive_desc(), param.num_group, net); - auto weight_mem = weight_data.first; + auto weight_mem = GetWeights(in_data[conv::kWeight], + fwd_pd.weights_primitive_desc(), param.num_group); auto out_mem = const_cast(out_data[conv::kOut]).CreateMKLDNNData( fwd_pd.dst_primitive_desc()); if (param.no_bias) { - net.push_back(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, - *out_mem)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, + *data_mem, *weight_mem, *out_mem)); } else { - auto bias_mem = in_data[conv::kBias].GetMKLDNNData(fwd_pd.bias_primitive_desc(), net); - net.push_back(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, - *bias_mem, *out_mem)); + auto bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd_pd.bias_primitive_desc()); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, + *data_mem, *weight_mem, *bias_mem, *out_mem)); } - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + MKLDNNStream::Instance().Submit(); } void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, @@ -210,39 +208,35 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; - std::vector net; - std::shared_ptr in_grad_mem, in_grad_weight, in_grad_bias; - std::pair weight_data; if (req[conv::kData]) { mkldnn::convolution_backward_data::primitive_desc bwdData_pd = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], inputs[conv::kOut], fwd_pd); - auto out_grad_mem = inputs[conv::kOut].GetMKLDNNData( - bwdData_pd.diff_dst_primitive_desc(), net); - weight_data = GetWeights(inputs[conv::kWeight + 1], - bwdData_pd.weights_primitive_desc(), param.num_group, net); - auto weight_mem = weight_data.first; - in_grad_mem = const_cast(in_grad[conv::kData]).CreateMKLDNNData( + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdData_pd.diff_dst_primitive_desc()); + auto weight_mem = GetWeights(inputs[conv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group); + auto in_grad_mem = const_cast(in_grad[conv::kData]).CreateMKLDNNData( bwdData_pd.diff_src_primitive_desc()); bool copy_back = false; if (in_grad_mem == nullptr) { in_grad_mem = CreateMKLDNNMem(bwdData_pd.diff_src_primitive_desc()); copy_back = true; } - net.push_back(mkldnn::convolution_backward_data(bwdData_pd, *out_grad_mem, - *weight_mem, *in_grad_mem)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, + *out_grad_mem, *weight_mem, *in_grad_mem)); if (copy_back) - const_cast(in_grad[conv::kData]).CopyFrom(*in_grad_mem, net); + const_cast(in_grad[conv::kData]).CopyFrom(*in_grad_mem); } if (req[conv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut], fwd_pd); - auto out_grad_mem = inputs[conv::kOut].GetMKLDNNData( - bwdWeights_pd.diff_dst_primitive_desc(), net); - auto data_mem = inputs[conv::kData + 1].GetMKLDNNData( - bwdWeights_pd.src_primitive_desc(), net); - in_grad_weight = const_cast(in_grad[conv::kWeight]).CreateMKLDNNData( + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); + auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); + auto in_grad_weight = const_cast(in_grad[conv::kWeight]).CreateMKLDNNData( bwdWeights_pd.diff_weights_primitive_desc()); bool copy_back_weight = false; bool copy_back_bias = false; @@ -250,9 +244,10 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); copy_back_weight = true; } + mkldnn_mem_const_ptr in_grad_bias; if (param.no_bias) { - net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, - *data_mem, *out_grad_mem, *in_grad_weight)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); } else { in_grad_bias = const_cast(in_grad[conv::kBias]).CreateMKLDNNData( bwdWeights_pd.diff_bias_primitive_desc()); @@ -260,15 +255,15 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c in_grad_bias = CreateMKLDNNMem(bwdWeights_pd.diff_bias_primitive_desc()); copy_back_bias = true; } - net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, - *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); } if (copy_back_weight) - const_cast(in_grad[conv::kWeight]).CopyFrom(*in_grad_weight, net); + const_cast(in_grad[conv::kWeight]).CopyFrom(*in_grad_weight); if (copy_back_bias) - const_cast(in_grad[conv::kBias]).CopyFrom(*in_grad_bias, net); + const_cast(in_grad[conv::kBias]).CopyFrom(*in_grad_bias); } - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + MKLDNNStream::Instance().Submit(); } } diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 31c91f4c7373..f8675b637f62 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -172,15 +172,13 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & const std::vector &out_data) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); - std::vector net; mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd( param, in_data[deconv::kData], in_data[deconv::kWeight], param.no_bias ? 
nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); - auto data_mem = in_data[deconv::kData].GetMKLDNNData( - deconvFwd_pd.diff_src_primitive_desc(), net); - auto weight_data = GetWeights(in_data[deconv::kWeight], - deconvFwd_pd.weights_primitive_desc(), param.num_group, net); - auto weight_mem = weight_data.first; + auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( + deconvFwd_pd.diff_src_primitive_desc()); + auto weight_mem = GetWeights(in_data[deconv::kWeight], + deconvFwd_pd.weights_primitive_desc(), param.num_group); auto out_mem = const_cast(out_data[deconv::kOut]).CreateMKLDNNData( deconvFwd_pd.diff_dst_primitive_desc()); bool copy_back = false; @@ -189,11 +187,11 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & copy_back = true; } - net.push_back(mkldnn::convolution_backward_data(deconvFwd_pd, *data_mem, *weight_mem, - *out_mem)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data( + deconvFwd_pd, *data_mem, *weight_mem, *out_mem)); if (copy_back) - const_cast(out_data[deconv::kOut]).CopyFrom(*out_mem, net); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + const_cast(out_data[deconv::kOut]).CopyFrom(*out_mem); + MKLDNNStream::Instance().Submit(); if (!param.no_bias) { // add bias, broadcast bias to dim 1: channel // TODO this is problematic if the layout isn't expected. @@ -213,29 +211,25 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext const DeconvolutionParam& param = nnvm::get(attrs.parsed); CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - std::vector net; mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], nullptr, inputs[deconv::kOut]); - std::shared_ptr in_grad_mem, in_grad_weight, in_grad_bias; - std::pair weight_data; if (req[deconv::kData]) { - auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNData( - bwdData_pd.src_primitive_desc(), net); - weight_data = GetWeights(inputs[deconv::kWeight + 1], - bwdData_pd.weights_primitive_desc(), param.num_group, net); - auto weight_mem = weight_data.first; - in_grad_mem = const_cast(in_grad[deconv::kData]).CreateMKLDNNData( + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdData_pd.src_primitive_desc()); + auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group); + auto in_grad_mem = const_cast(in_grad[deconv::kData]).CreateMKLDNNData( bwdData_pd.dst_primitive_desc()); bool copy_back = false; if (in_grad_mem == nullptr) { in_grad_mem = CreateMKLDNNMem(bwdData_pd.dst_primitive_desc()); copy_back = true; } - net.push_back(mkldnn::convolution_forward(bwdData_pd, *out_grad_mem, - *weight_mem, *in_grad_mem)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(bwdData_pd, + *out_grad_mem, *weight_mem, *in_grad_mem)); if (copy_back) - const_cast(in_grad[deconv::kData]).CopyFrom(*in_grad_mem, net); + const_cast(in_grad[deconv::kData]).CopyFrom(*in_grad_mem); } if (req[deconv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd @@ -244,11 +238,11 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext param.no_bias ? 
nullptr : &inputs[deconv::kWeight + 1], inputs[deconv::kOut], bwdData_pd); CHECK_NE(req[deconv::kWeight], kAddTo); - auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNData( - bwdWeights_pd.diff_dst_primitive_desc(), net); - auto data_mem = inputs[deconv::kData + 1].GetMKLDNNData( - bwdWeights_pd.src_primitive_desc(), net); - in_grad_weight = const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData( + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); + auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); + auto in_grad_weight = const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData( bwdWeights_pd.diff_weights_primitive_desc()); bool copy_back_weight = false; bool copy_back_bias = false; @@ -256,9 +250,10 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); copy_back_weight = true; } + mkldnn_mem_const_ptr in_grad_bias; if (param.no_bias) { - net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, - *out_grad_mem, *data_mem, *in_grad_weight)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight)); } else { in_grad_bias = const_cast(in_grad[deconv::kBias]).CreateMKLDNNData( bwdWeights_pd.diff_bias_primitive_desc()); @@ -266,15 +261,15 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext in_grad_bias = CreateMKLDNNMem(bwdWeights_pd.diff_bias_primitive_desc()); copy_back_bias = true; } - net.push_back(mkldnn::convolution_backward_weights(bwdWeights_pd, - *out_grad_mem, *data_mem, *in_grad_weight, *in_grad_bias)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight, *in_grad_bias)); } if (copy_back_weight) - const_cast(in_grad[deconv::kWeight]).CopyFrom(*in_grad_weight, net); + const_cast(in_grad[deconv::kWeight]).CopyFrom(*in_grad_weight); if (copy_back_bias) - const_cast(in_grad[deconv::kBias]).CopyFrom(*in_grad_bias, net); + const_cast(in_grad[deconv::kBias]).CopyFrom(*in_grad_bias); } - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + MKLDNNStream::Instance().Submit(); } } diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 3d3ef4689835..6e73fd50f95d 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -87,13 +87,12 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); - std::vector net; mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( in_data[fullc::kData], in_data[fullc::kWeight], param.no_bias ? 
nullptr : &in_data[fullc::kBias], out_data[fullc::kOut]); - auto data_mem = in_data[fullc::kData].GetMKLDNNData(ipFwd_pd.src_primitive_desc(), net); - auto weight_mem = in_data[fullc::kWeight].GetMKLDNNData( - ipFwd_pd.weights_primitive_desc(), net); + auto data_mem = in_data[fullc::kData].GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); + auto weight_mem = in_data[fullc::kWeight].GetMKLDNNDataReorder( + ipFwd_pd.weights_primitive_desc()); auto out_mem = const_cast(out_data[fullc::kOut]).CreateMKLDNNData( ipFwd_pd.dst_primitive_desc()); bool copy_back = false; @@ -102,16 +101,16 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, copy_back = true; } if (param.no_bias) { - net.push_back(mkldnn::inner_product_forward(ipFwd_pd, *data_mem, *weight_mem, - *out_mem)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward( + ipFwd_pd, *data_mem, *weight_mem, *out_mem)); } else { - auto bias_mem = in_data[fullc::kBias].GetMKLDNNData(ipFwd_pd.bias_primitive_desc(), net); - net.push_back(mkldnn::inner_product_forward(ipFwd_pd, *data_mem, *weight_mem, - *bias_mem, *out_mem)); + auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc()); + MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, + *data_mem, *weight_mem, *bias_mem, *out_mem)); } if (copy_back) - const_cast(out_data[fullc::kOut]).CopyFrom(*out_mem, net); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + const_cast(out_data[fullc::kOut]).CopyFrom(*out_mem); + MKLDNNStream::Instance().Submit(); } void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, @@ -124,38 +123,36 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, param.no_bias ? nullptr : &in_grad[fullc::kBias], inputs[fullc::kOut]); CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; - std::vector net; - mkldnn_mem_ptr in_grad_mem, in_grad_weight, in_grad_bias; if (req[fullc::kData]) { mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData( inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], inputs[fullc::kOut], ipFwd_pd); - auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNData( - ipBwdData_pd.diff_dst_primitive_desc(), net); - auto weight_mem = inputs[fullc::kWeight + 1].GetMKLDNNData( - ipBwdData_pd.weights_primitive_desc(), net); - in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( + auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNDataReorder( + ipBwdData_pd.diff_dst_primitive_desc()); + auto weight_mem = inputs[fullc::kWeight + 1].GetMKLDNNDataReorder( + ipBwdData_pd.weights_primitive_desc()); + auto in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( ipBwdData_pd.diff_src_primitive_desc()); bool copy_back = false; if (in_grad_mem == nullptr) { in_grad_mem = CreateMKLDNNMem(ipBwdData_pd.diff_src_primitive_desc()); copy_back = true; } - net.push_back(mkldnn::inner_product_backward_data(ipBwdData_pd, *out_grad_mem, - *weight_mem, *in_grad_mem)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_data( + ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem)); if (copy_back) - const_cast(in_grad[fullc::kData]).CopyFrom(*in_grad_mem, net); + const_cast(in_grad[fullc::kData]).CopyFrom(*in_grad_mem); } if (req[fullc::kWeight]) { mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetIPBwdWeights(inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], param.no_bias ? 
nullptr : &in_grad[fullc::kBias], inputs[fullc::kOut], ipFwd_pd); - auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNData( - ipBwdWeights_pd.diff_dst_primitive_desc(), net); - auto data_mem = inputs[fullc::kData + 1].GetMKLDNNData( - ipBwdWeights_pd.src_primitive_desc(), net); - in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( + auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNDataReorder( + ipBwdWeights_pd.diff_dst_primitive_desc()); + auto data_mem = inputs[fullc::kData + 1].GetMKLDNNDataReorder( + ipBwdWeights_pd.src_primitive_desc()); + auto in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( ipBwdWeights_pd.diff_weights_primitive_desc()); bool copy_back_weight = false; bool copy_back_bias = false; @@ -163,9 +160,10 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, in_grad_weight = CreateMKLDNNMem(ipBwdWeights_pd.diff_weights_primitive_desc()); copy_back_weight = true; } + mkldnn_mem_const_ptr in_grad_bias; if (param.no_bias) { - net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, - *data_mem, *out_grad_mem, *in_grad_weight)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights( + ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); } else { in_grad_bias = const_cast(in_grad[fullc::kBias]).CreateMKLDNNData( ipBwdWeights_pd.diff_bias_primitive_desc()); @@ -173,15 +171,15 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, in_grad_bias = CreateMKLDNNMem(ipBwdWeights_pd.diff_bias_primitive_desc()); copy_back_bias = true; } - net.push_back(mkldnn::inner_product_backward_weights(ipBwdWeights_pd, - *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights( + ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); } if (copy_back_weight) - const_cast(in_grad[fullc::kWeight]).CopyFrom(*in_grad_weight, net); + const_cast(in_grad[fullc::kWeight]).CopyFrom(*in_grad_weight); if (copy_back_bias) - const_cast(in_grad[fullc::kBias]).CopyFrom(*in_grad_bias, net); + const_cast(in_grad[fullc::kBias]).CopyFrom(*in_grad_bias); } - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + MKLDNNStream::Instance().Submit(); } } diff --git a/src/operator/nn/mkldnn/mkldnn_relu-inl.h b/src/operator/nn/mkldnn/mkldnn_relu-inl.h index ada4bebe81d4..affb29ed7750 100644 --- a/src/operator/nn/mkldnn/mkldnn_relu-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_relu-inl.h @@ -61,11 +61,11 @@ void MKLDNNRelu_Forward(const OpContext &ctx, const NDArray &in_data, mkldnn::eltwise_relu, data_md, alpha); mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine); - std::vector net; std::shared_ptr output_memory = const_cast(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc()); - net.push_back(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterPrim(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); + stream.Submit(); } template @@ -92,12 +92,12 @@ void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad, mkldnn::eltwise_backward::desc bw_desc(mkldnn::eltwise_relu, diff_md, data_md, alpha); mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); - std::vector net; std::shared_ptr diff_src_memory = const_cast(in_grad).CreateMKLDNNData(bw_pdesc.diff_src_primitive_desc()); - 
net.push_back(mkldnn::eltwise_backward(bw_pdesc, *input_mem, + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, *diff_dst_memory, *diff_src_memory)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + stream.Submit(); } } // namespace op From d750f86599e01c1678d3671a6c1b86d979349f6a Mon Sep 17 00:00:00 2001 From: Da zheng Date: Tue, 7 Nov 2017 01:35:26 +0000 Subject: [PATCH 066/264] Handle kAddTo in MKLDNN operators. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 38 ++++++++++++ src/operator/nn/mkldnn/mkldnn_convolution.cc | 55 ++++++----------- .../nn/mkldnn/mkldnn_deconvolution.cc | 61 ++++++------------- .../nn/mkldnn/mkldnn_fully_connected.cc | 61 ++++++------------- src/operator/nn/mkldnn/mkldnn_relu-inl.h | 8 +-- src/operator/nn/mkldnn/mkldnn_sum.cc | 52 ++++++++++++++++ 6 files changed, 149 insertions(+), 126 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_sum.cc diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 733980ef54e8..6d6671c181a4 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -151,6 +151,44 @@ inline static mkldnn_mem_ptr CreateMKLDNNMem(const mkldnn::memory::primitive_des return ret; } +enum OutDataOp { + Noop, + CopyBack, + AddBack, +}; + +typedef std::pair mkldnn_output_t; + +static inline mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, OpReqType req) { + if (kAddTo == req) + return mkldnn_output_t(OutDataOp::AddBack, CreateMKLDNNMem(desc)); + else { + mkldnn_mem_ptr mem = const_cast(arr).CreateMKLDNNData(desc); + if (mem == nullptr) + return mkldnn_output_t(OutDataOp::CopyBack, CreateMKLDNNMem(desc)); + else + return mkldnn_output_t(OutDataOp::Noop, mem); + } +} + +namespace op { +void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, + const mkldnn::memory &out); +} + +static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { + if (res.first == CopyBack) + const_cast(arr).CopyFrom(*res.second); + else if (res.first == AddBack) { + // TODO I might need to reorder. 
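+    // kAddTo: fetch the existing output in the layout of the newly
+    // computed result, sum the two into a temporary, and copy that
+    // temporary back into the array. The TODO above concerns arrays
+    // whose current layout differs from res.second and would need a
+    // reorder before the sum.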
+ mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); + mkldnn_mem_ptr out = CreateMKLDNNMem(res.second->get_primitive_desc()); + op::Sum(*res.second, *mem, *out); + const_cast(arr).CopyFrom(*out); + } +} + inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, int num_groups) { mkldnn_mem_const_ptr mem; diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 28ee1874d6d8..61134d0d8021 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -182,18 +182,18 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct auto engine = CpuEngine::Instance().get_engine(); auto weight_mem = GetWeights(in_data[conv::kWeight], fwd_pd.weights_primitive_desc(), param.num_group); - - auto out_mem = const_cast(out_data[conv::kOut]).CreateMKLDNNData( - fwd_pd.dst_primitive_desc()); + auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], + fwd_pd.dst_primitive_desc(), req[conv::kOut]); if (param.no_bias) { MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, - *data_mem, *weight_mem, *out_mem)); + *data_mem, *weight_mem, *out_mem.second)); } else { auto bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd_pd.bias_primitive_desc()); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, - *data_mem, *weight_mem, *bias_mem, *out_mem)); + *data_mem, *weight_mem, *bias_mem, *out_mem.second)); } + CommitOutput(out_data[conv::kOut], out_mem); MKLDNNStream::Instance().Submit(); } @@ -216,17 +216,11 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c bwdData_pd.diff_dst_primitive_desc()); auto weight_mem = GetWeights(inputs[conv::kWeight + 1], bwdData_pd.weights_primitive_desc(), param.num_group); - auto in_grad_mem = const_cast(in_grad[conv::kData]).CreateMKLDNNData( - bwdData_pd.diff_src_primitive_desc()); - bool copy_back = false; - if (in_grad_mem == nullptr) { - in_grad_mem = CreateMKLDNNMem(bwdData_pd.diff_src_primitive_desc()); - copy_back = true; - } + auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], + bwdData_pd.diff_src_primitive_desc(), req[conv::kData]); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, - *out_grad_mem, *weight_mem, *in_grad_mem)); - if (copy_back) - const_cast(in_grad[conv::kData]).CopyFrom(*in_grad_mem); + *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[conv::kData], in_grad_mem); } if (req[conv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd @@ -236,32 +230,21 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c bwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = const_cast(in_grad[conv::kWeight]).CreateMKLDNNData( - bwdWeights_pd.diff_weights_primitive_desc()); - bool copy_back_weight = false; - bool copy_back_bias = false; - if (in_grad_weight == nullptr) { - in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); - copy_back_weight = true; - } - mkldnn_mem_const_ptr in_grad_bias; + auto in_grad_weight = CreateMKLDNNMem(in_grad[conv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), req[conv::kWeight]); + mkldnn_output_t in_grad_bias; if (param.no_bias) { 
MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); } else { - in_grad_bias = const_cast(in_grad[conv::kBias]).CreateMKLDNNData( - bwdWeights_pd.diff_bias_primitive_desc()); - if (in_grad_bias == nullptr) { - in_grad_bias = CreateMKLDNNMem(bwdWeights_pd.diff_bias_primitive_desc()); - copy_back_bias = true; - } + in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias], + bwdWeights_pd.diff_bias_primitive_desc(), req[conv::kBias]); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, + *in_grad_bias.second)); } - if (copy_back_weight) - const_cast(in_grad[conv::kWeight]).CopyFrom(*in_grad_weight); - if (copy_back_bias) - const_cast(in_grad[conv::kBias]).CopyFrom(*in_grad_bias); + CommitOutput(in_grad[conv::kWeight], in_grad_weight); + CommitOutput(in_grad[conv::kBias], in_grad_bias); } MKLDNNStream::Instance().Submit(); } diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index f8675b637f62..8a8566432706 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -179,18 +179,12 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & deconvFwd_pd.diff_src_primitive_desc()); auto weight_mem = GetWeights(in_data[deconv::kWeight], deconvFwd_pd.weights_primitive_desc(), param.num_group); - auto out_mem = const_cast(out_data[deconv::kOut]).CreateMKLDNNData( - deconvFwd_pd.diff_dst_primitive_desc()); - bool copy_back = false; - if (out_mem == nullptr) { - out_mem = CreateMKLDNNMem(deconvFwd_pd.diff_dst_primitive_desc()); - copy_back = true; - } + auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], + deconvFwd_pd.diff_dst_primitive_desc(), req[deconv::kOut]); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data( - deconvFwd_pd, *data_mem, *weight_mem, *out_mem)); - if (copy_back) - const_cast(out_data[deconv::kOut]).CopyFrom(*out_mem); + deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second)); + CommitOutput(out_data[deconv::kOut], out_mem); MKLDNNStream::Instance().Submit(); if (!param.no_bias) { // add bias, broadcast bias to dim 1: channel @@ -209,7 +203,6 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext const std::vector& outputs) { const std::vector &in_grad = outputs; const DeconvolutionParam& param = nnvm::get(attrs.parsed); - CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], nullptr, @@ -219,17 +212,11 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext bwdData_pd.src_primitive_desc()); auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], bwdData_pd.weights_primitive_desc(), param.num_group); - auto in_grad_mem = const_cast(in_grad[deconv::kData]).CreateMKLDNNData( - bwdData_pd.dst_primitive_desc()); - bool copy_back = false; - if (in_grad_mem == nullptr) { - in_grad_mem = CreateMKLDNNMem(bwdData_pd.dst_primitive_desc()); - copy_back = true; - } + auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], + bwdData_pd.dst_primitive_desc(), req[deconv::kData]); 
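+    // in_grad_mem is an (OutDataOp, memory) pair: the primitive writes
+    // into .second, and CommitOutput below copies or accumulates the
+    // result into the gradient array according to req[deconv::kData].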
MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(bwdData_pd, - *out_grad_mem, *weight_mem, *in_grad_mem)); - if (copy_back) - const_cast(in_grad[deconv::kData]).CopyFrom(*in_grad_mem); + *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[deconv::kData], in_grad_mem); } if (req[deconv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd @@ -237,37 +224,25 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext inputs[deconv::kWeight + 1], param.no_bias ? nullptr : &inputs[deconv::kWeight + 1], inputs[deconv::kOut], bwdData_pd); - CHECK_NE(req[deconv::kWeight], kAddTo); auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( bwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData( - bwdWeights_pd.diff_weights_primitive_desc()); - bool copy_back_weight = false; - bool copy_back_bias = false; - if (in_grad_weight == nullptr) { - in_grad_weight = CreateMKLDNNMem(bwdWeights_pd.diff_weights_primitive_desc()); - copy_back_weight = true; - } - mkldnn_mem_const_ptr in_grad_bias; + auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), req[deconv::kWeight]); + mkldnn_output_t in_grad_bias; if (param.no_bias) { MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight)); + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); } else { - in_grad_bias = const_cast(in_grad[deconv::kBias]).CreateMKLDNNData( - bwdWeights_pd.diff_bias_primitive_desc()); - if (in_grad_bias == nullptr) { - in_grad_bias = CreateMKLDNNMem(bwdWeights_pd.diff_bias_primitive_desc()); - copy_back_bias = true; - } + in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], + bwdWeights_pd.diff_bias_primitive_desc(), req[deconv::kBias]); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight, *in_grad_bias)); + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second, + *in_grad_bias.second)); } - if (copy_back_weight) - const_cast(in_grad[deconv::kWeight]).CopyFrom(*in_grad_weight); - if (copy_back_bias) - const_cast(in_grad[deconv::kBias]).CopyFrom(*in_grad_bias); + CommitOutput(in_grad[deconv::kWeight], in_grad_weight); + CommitOutput(in_grad[deconv::kBias], in_grad_bias); } MKLDNNStream::Instance().Submit(); } diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 6e73fd50f95d..ae80dd8f9095 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -93,23 +93,17 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto data_mem = in_data[fullc::kData].GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); auto weight_mem = in_data[fullc::kWeight].GetMKLDNNDataReorder( ipFwd_pd.weights_primitive_desc()); - auto out_mem = const_cast(out_data[fullc::kOut]).CreateMKLDNNData( - ipFwd_pd.dst_primitive_desc()); - bool copy_back = false; - if (out_mem == nullptr) { - out_mem = CreateMKLDNNMem(ipFwd_pd.dst_primitive_desc()); - copy_back = true; - } + auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], + ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]); if (param.no_bias) { 
MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward( - ipFwd_pd, *data_mem, *weight_mem, *out_mem)); + ipFwd_pd, *data_mem, *weight_mem, *out_mem.second)); } else { auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc()); MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, - *data_mem, *weight_mem, *bias_mem, *out_mem)); + *data_mem, *weight_mem, *bias_mem, *out_mem.second)); } - if (copy_back) - const_cast(out_data[fullc::kOut]).CopyFrom(*out_mem); + CommitOutput(out_data[fullc::kOut], out_mem); MKLDNNStream::Instance().Submit(); } @@ -131,17 +125,11 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, ipBwdData_pd.diff_dst_primitive_desc()); auto weight_mem = inputs[fullc::kWeight + 1].GetMKLDNNDataReorder( ipBwdData_pd.weights_primitive_desc()); - auto in_grad_mem = const_cast(in_grad[fullc::kData]).CreateMKLDNNData( - ipBwdData_pd.diff_src_primitive_desc()); - bool copy_back = false; - if (in_grad_mem == nullptr) { - in_grad_mem = CreateMKLDNNMem(ipBwdData_pd.diff_src_primitive_desc()); - copy_back = true; - } + auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData], + ipBwdData_pd.diff_src_primitive_desc(), req[fullc::kData]); MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_data( - ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem)); - if (copy_back) - const_cast(in_grad[fullc::kData]).CopyFrom(*in_grad_mem); + ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[fullc::kData], in_grad_mem); } if (req[fullc::kWeight]) { mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd @@ -152,32 +140,21 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, ipBwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = inputs[fullc::kData + 1].GetMKLDNNDataReorder( ipBwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = const_cast(in_grad[fullc::kWeight]).CreateMKLDNNData( - ipBwdWeights_pd.diff_weights_primitive_desc()); - bool copy_back_weight = false; - bool copy_back_bias = false; - if (in_grad_weight == nullptr) { - in_grad_weight = CreateMKLDNNMem(ipBwdWeights_pd.diff_weights_primitive_desc()); - copy_back_weight = true; - } - mkldnn_mem_const_ptr in_grad_bias; + auto in_grad_weight = CreateMKLDNNMem(in_grad[fullc::kWeight], + ipBwdWeights_pd.diff_weights_primitive_desc(), req[fullc::kWeight]); + mkldnn_output_t in_grad_bias; if (param.no_bias) { MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights( - ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight)); + ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); } else { - in_grad_bias = const_cast(in_grad[fullc::kBias]).CreateMKLDNNData( - ipBwdWeights_pd.diff_bias_primitive_desc()); - if (in_grad_bias == nullptr) { - in_grad_bias = CreateMKLDNNMem(ipBwdWeights_pd.diff_bias_primitive_desc()); - copy_back_bias = true; - } + in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias], + ipBwdWeights_pd.diff_bias_primitive_desc(), req[fullc::kBias]); MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights( - ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight, *in_grad_bias)); + ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, + *in_grad_bias.second)); } - if (copy_back_weight) - const_cast(in_grad[fullc::kWeight]).CopyFrom(*in_grad_weight); - if (copy_back_bias) - const_cast(in_grad[fullc::kBias]).CopyFrom(*in_grad_bias); + 
CommitOutput(in_grad[fullc::kWeight], in_grad_weight); + CommitOutput(in_grad[fullc::kBias], in_grad_bias); } MKLDNNStream::Instance().Submit(); } diff --git a/src/operator/nn/mkldnn/mkldnn_relu-inl.h b/src/operator/nn/mkldnn/mkldnn_relu-inl.h index affb29ed7750..25ad61a5d68c 100644 --- a/src/operator/nn/mkldnn/mkldnn_relu-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_relu-inl.h @@ -76,9 +76,7 @@ void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad, return; } - // TODO we need to handle req std::shared_ptr diff_dst_memory = out_grad.GetMKLDNNData(); - // TODO shouldn't it be out_data? std::shared_ptr input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); @@ -92,11 +90,11 @@ void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad, mkldnn::eltwise_backward::desc bw_desc(mkldnn::eltwise_relu, diff_md, data_md, alpha); mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); - std::shared_ptr diff_src_memory - = const_cast(in_grad).CreateMKLDNNData(bw_pdesc.diff_src_primitive_desc()); + auto diff_src_memory = CreateMKLDNNMem(in_grad, bw_pdesc.diff_src_primitive_desc(), req); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, - *diff_dst_memory, *diff_src_memory)); + *diff_dst_memory, *diff_src_memory.second)); + CommitOutput(in_grad, diff_src_memory); stream.Submit(); } diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc new file mode 100644 index 000000000000..61ec1bbc4199 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_sum.cc + * \brief + * \author Da Zheng +*/ +#include + +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, + const mkldnn::memory &out) { + std::vector input_pds(2); + std::vector scales(2); + std::vector inputs; + input_pds[0] = arr1.get_primitive_desc(); + input_pds[1] = arr2.get_primitive_desc(); + CHECK(input_pds[0] == input_pds[1]); + scales[0] = 1; + scales[1] = 1; + inputs.push_back(arr1); + inputs.push_back(arr2); + mkldnn::sum::primitive_desc sum_pd(scales, input_pds); + MKLDNNStream::Instance().RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); +} + +} +} +#endif From 3b9395d436821994c2ed8f3e7388bd1c57e57679 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Tue, 7 Nov 2017 04:14:24 +0000 Subject: [PATCH 067/264] Fix a bug in deconvolution. 
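MKLDNN exposes no dedicated deconvolution primitive in this path, so the
forward pass is built on mkldnn::convolution_backward_data: the
deconvolution input takes the role of the underlying convolution's
diff_dst, and the deconvolution output the role of its diff_src. The
previous code reordered each side against the opposite descriptor. A
minimal sketch of the corrected mapping (identifiers follow the
surrounding code; illustration only, not part of the diff):

    // deconvolution input == diff_dst of the underlying convolution
    auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder(
        deconvFwd_pd.diff_dst_primitive_desc());
    // deconvolution output == diff_src of the underlying convolution
    auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut],
        deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]);

The weight-gradient path is fixed the same way: out_grad now maps to the
convolution's src descriptor and the saved input to its diff_dst.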
--- src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 8a8566432706..7e5daf6ed251 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -176,11 +176,11 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & param, in_data[deconv::kData], in_data[deconv::kWeight], param.no_bias ? nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( - deconvFwd_pd.diff_src_primitive_desc()); + deconvFwd_pd.diff_dst_primitive_desc()); auto weight_mem = GetWeights(in_data[deconv::kWeight], deconvFwd_pd.weights_primitive_desc(), param.num_group); auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], - deconvFwd_pd.diff_dst_primitive_desc(), req[deconv::kOut]); + deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data( deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second)); @@ -225,9 +225,9 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext param.no_bias ? nullptr : &inputs[deconv::kWeight + 1], inputs[deconv::kOut], bwdData_pd); auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( - bwdWeights_pd.diff_dst_primitive_desc()); - auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); + auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], bwdWeights_pd.diff_weights_primitive_desc(), req[deconv::kWeight]); mkldnn_output_t in_grad_bias; From b3518aa2c2c55eec4d444e9eeb2dc4e20820f306 Mon Sep 17 00:00:00 2001 From: Da zheng Date: Tue, 7 Nov 2017 04:41:21 +0000 Subject: [PATCH 068/264] Fix bugs in NDArray. --- src/ndarray/ndarray.cc | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 369c96a6ce19..d81c7726c51a 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -257,20 +257,19 @@ void NDArray::set_fresh_out_grad(bool state) const { } #if MXNET_USE_MKLDNN == 1 -static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) { +static inline bool same_shape(const TShape &shape, mkldnn::memory::primitive_desc pd) { + int ndims = pd.desc().data.ndims; if (shape.ndim() != ndims) return false; for (int i = 0; i < ndims; i++) - if (shape[i] != dims[i]) + if (shape[i] != pd.desc().data.dims[i]) return false; return true; } void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { - if (Mkl_mem_ && same_shape(shape, Mkl_mem_->get_primitive_desc().desc().data.dims, - Mkl_mem_->get_primitive_desc().desc().data.ndims)) { + if (Mkl_mem_ && same_shape(shape, Mkl_mem_->get_primitive_desc())) return; - } mkldnn::memory::dims dims(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) @@ -306,6 +305,10 @@ static int GetTypeSize(int dtype) { std::shared_ptr NDArray::GetMKLDNNData( const mkldnn::memory::primitive_desc &desc) const { + // If the array size doesn't match, we should reset MKL memory. 
+ if (ptr_->Mkl_mem_ && !same_shape(shape(), ptr_->Mkl_mem_->get_primitive_desc())) + ptr_->Mkl_mem_ = nullptr; + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; @@ -321,6 +324,10 @@ std::shared_ptr NDArray::GetMKLDNNData( std::shared_ptr NDArray::GetMKLDNNDataReorder( const mkldnn::memory::primitive_desc &desc) const { + // If the array size doesn't match, we should reset MKL memory. + if (ptr_->Mkl_mem_ && !same_shape(shape(), ptr_->Mkl_mem_->get_primitive_desc())) + ptr_->Mkl_mem_ = nullptr; + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; @@ -390,6 +397,7 @@ void NDArray::SetTBlob() const { } else if (stype == kMKLDNNStorage) { // TODO we may really need to convert format. CHECK_EQ(byte_offset_, 0); + ptr_->SetMKLMem(shape_, dtype_); dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); #endif } else { From c51576c01e38526ccf6e99061e241e8e92cfa4bf Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 7 Nov 2017 19:50:39 +0000 Subject: [PATCH 069/264] Revert "Fix bugs in NDArray." This reverts commit f5624a4aa9f9b9f9fe31f5e6cfa7a9752838fc4e. --- src/ndarray/ndarray.cc | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index d81c7726c51a..369c96a6ce19 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -257,19 +257,20 @@ void NDArray::set_fresh_out_grad(bool state) const { } #if MXNET_USE_MKLDNN == 1 -static inline bool same_shape(const TShape &shape, mkldnn::memory::primitive_desc pd) { - int ndims = pd.desc().data.ndims; +static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) { if (shape.ndim() != ndims) return false; for (int i = 0; i < ndims; i++) - if (shape[i] != pd.desc().data.dims[i]) + if (shape[i] != dims[i]) return false; return true; } void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { - if (Mkl_mem_ && same_shape(shape, Mkl_mem_->get_primitive_desc())) + if (Mkl_mem_ && same_shape(shape, Mkl_mem_->get_primitive_desc().desc().data.dims, + Mkl_mem_->get_primitive_desc().desc().data.ndims)) { return; + } mkldnn::memory::dims dims(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) @@ -305,10 +306,6 @@ static int GetTypeSize(int dtype) { std::shared_ptr NDArray::GetMKLDNNData( const mkldnn::memory::primitive_desc &desc) const { - // If the array size doesn't match, we should reset MKL memory. - if (ptr_->Mkl_mem_ && !same_shape(shape(), ptr_->Mkl_mem_->get_primitive_desc())) - ptr_->Mkl_mem_ = nullptr; - if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; @@ -324,10 +321,6 @@ std::shared_ptr NDArray::GetMKLDNNData( std::shared_ptr NDArray::GetMKLDNNDataReorder( const mkldnn::memory::primitive_desc &desc) const { - // If the array size doesn't match, we should reset MKL memory. - if (ptr_->Mkl_mem_ && !same_shape(shape(), ptr_->Mkl_mem_->get_primitive_desc())) - ptr_->Mkl_mem_ = nullptr; - if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; @@ -397,7 +390,6 @@ void NDArray::SetTBlob() const { } else if (stype == kMKLDNNStorage) { // TODO we may really need to convert format. 
CHECK_EQ(byte_offset_, 0); - ptr_->SetMKLMem(shape_, dtype_); dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); #endif } else { From 8e4acd1b2bc44391f74b49725dca0a8d4a65677f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 7 Nov 2017 20:13:41 +0000 Subject: [PATCH 070/264] Fix a bug in NDArray. --- src/ndarray/ndarray.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 369c96a6ce19..3a67f91d5453 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -267,8 +267,11 @@ static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims } void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { - if (Mkl_mem_ && same_shape(shape, Mkl_mem_->get_primitive_desc().desc().data.dims, - Mkl_mem_->get_primitive_desc().desc().data.ndims)) { + // The shape of the array and the one of the MKL memory may mismatch. + // For example, if the array stores parameters, the MKL memory may store data + // in 5 dimensions while the NDArray stores data in 4 dimensions. + // TODO is it possible that the MKL memory is out-of-date? + if (Mkl_mem_) { return; } @@ -328,6 +331,10 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( if (ptr_->storage_type == kDefaultStorage) { ptr_->SetMKLMem(shape_, dtype_); } + // If the array uses the default format, the MKL memory now references to + // the default storage. If it uses the MKLDNN format, the MKL memory should + // have been initialized since we are trying to get data from the array. + CHECK(ptr_->Mkl_mem_ != nullptr); if (ptr_->Mkl_mem_->get_primitive_desc() == desc) return ptr_->Mkl_mem_; else { @@ -390,6 +397,7 @@ void NDArray::SetTBlob() const { } else if (stype == kMKLDNNStorage) { // TODO we may really need to convert format. CHECK_EQ(byte_offset_, 0); + ptr_->SetMKLMem(shape_, dtype_); dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); #endif } else { From e5f9c2fbd53bc560e7d6804c4bbdcf7322b3446e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 7 Nov 2017 22:17:38 +0000 Subject: [PATCH 071/264] Fix a bug in NDArray. --- src/ndarray/ndarray.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 3a67f91d5453..e8fb49119a85 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -271,7 +271,7 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // For example, if the array stores parameters, the MKL memory may store data // in 5 dimensions while the NDArray stores data in 4 dimensions. // TODO is it possible that the MKL memory is out-of-date? - if (Mkl_mem_) { + if (Mkl_mem_ && storage_type == kMKLDNNStorage) { return; } @@ -315,6 +315,7 @@ std::shared_ptr NDArray::GetMKLDNNData( } if (ptr_->Mkl_mem_) { CHECK(ptr_->Mkl_mem_->get_primitive_desc() == desc); + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; } mkldnn_mem_const_ptr ret(new mkldnn::memory(desc, ptr_->shandle.dptr)); @@ -335,8 +336,10 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( // the default storage. If it uses the MKLDNN format, the MKL memory should // have been initialized since we are trying to get data from the array. CHECK(ptr_->Mkl_mem_ != nullptr); - if (ptr_->Mkl_mem_->get_primitive_desc() == desc) + if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; + } else { // TODO we should manage the memory allocation here. 
mkldnn_mem_ptr ret(new mkldnn::memory(desc)); @@ -349,8 +352,10 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( std::shared_ptr NDArray::GetMKLDNNData() const { ptr_->SetMKLMem(shape_, dtype_); - if (ptr_->Mkl_mem_) + if (ptr_->Mkl_mem_) { + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; + } else // TODO We don't support converting sparse format. return nullptr; @@ -375,8 +380,10 @@ std::shared_ptr NDArray::CreateMKLDNNData( return nullptr; } - if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) + if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; + } ptr_->Mkl_mem_ = CreateMKLDNNMem(desc); return ptr_->Mkl_mem_; From 7853520ad77ef832dda1c099bd6312f10e09904e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 01:25:38 +0000 Subject: [PATCH 072/264] Reorder MKLDNN memory to default format in SetTBlob. --- src/ndarray/ndarray.cc | 93 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e8fb49119a85..6cd0633463f2 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -266,6 +266,90 @@ static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims return true; } +static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { + if (desc.data.ndims == 1) + return desc.data.format; + else if (desc.data.ndims == 2) { + if (desc.data.format == mkldnn_io) + return mkldnn_oi; + else + return desc.data.format; + } + else if (desc.data.ndims == 4) { + switch (desc.data.format) { + case mkldnn_nchw: + case mkldnn_nhwc: + case mkldnn_chwn: + case mkldnn_nChw8c: + case mkldnn_nChw16c: + return mkldnn_nchw; + case mkldnn_oihw: + case mkldnn_ihwo: + case mkldnn_hwio: + case mkldnn_OIhw8i8o: + case mkldnn_OIhw16i16o: + case mkldnn_OIhw8i16o2i: + case mkldnn_OIhw8o16i2o: + case mkldnn_OIhw8o8i: + case mkldnn_OIhw16o16i: + case mkldnn_IOhw16o16i: + case mkldnn_Oihw8o: + case mkldnn_Oihw16o: + case mkldnn_Ohwi8o: + case mkldnn_Ohwi16o: + case mkldnn_OhIw16o4i: + return mkldnn_oihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } + else if (desc.data.ndims == 5) { + switch (desc.data.format) { + case mkldnn_goihw: + case mkldnn_gOIhw8i8o: + case mkldnn_gOIhw16i16o: + case mkldnn_gOIhw8i16o2i: + case mkldnn_gOIhw8o16i2o: + case mkldnn_gOIhw8o8i: + case mkldnn_gOIhw16o16i: + case mkldnn_gIOhw16o16i: + case mkldnn_gOihw8o: + case mkldnn_gOihw16o: + case mkldnn_gOhwi8o: + case mkldnn_gOhwi16o: + case mkldnn_gOhIw16o4i: + return mkldnn_goihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } + else { + LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; + return mkldnn_format_undef; + } +} + +static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem) { + auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); + if (format == mem->get_primitive_desc().desc().data.format) + return mem; + + printf("reorder to default\n"); + mkldnn::memory::desc desc = mem->get_primitive_desc().desc(); + desc.data.format = format; + mkldnn::memory::primitive_desc pd(desc, mem->get_primitive_desc().get_engine()); + mkldnn_mem_ptr def_mem(new mkldnn::memory(pd)); + + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterMem(def_mem); + 
stream.RegisterPrim(mkldnn::reorder(*mem, *def_mem)); + // TODO do I have to submit it here? + stream.Submit(); + return def_mem; +} + void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data @@ -318,6 +402,9 @@ std::shared_ptr NDArray::GetMKLDNNData( MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; } + // If we are getting data from the NDArray, it has to use the default storage + // if Mkl_mem_ is null. + CHECK_EQ(ptr_->storage_type, kDefaultStorage); mkldnn_mem_const_ptr ret(new mkldnn::memory(desc, ptr_->shandle.dptr)); MKLDNNStream::Instance().RegisterMem(ret); return ret; @@ -366,6 +453,7 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { LOG(FATAL) << "The NDArray hasn't been initialized"; return; } + // TODO if the shape mismatches. ptr_->SetMKLMem(shape_, dtype_); MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); } @@ -404,7 +492,10 @@ void NDArray::SetTBlob() const { } else if (stype == kMKLDNNStorage) { // TODO we may really need to convert format. CHECK_EQ(byte_offset_, 0); - ptr_->SetMKLMem(shape_, dtype_); + if (ptr_->Mkl_mem_) + ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_); + else + ptr_->SetMKLMem(shape_, dtype_); dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); #endif } else { From d2af1f3f53f789385f9fd4a65a02d430d4842403 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 02:08:31 +0000 Subject: [PATCH 073/264] Disable MKLDNN correctly. --- src/ndarray/ndarray.cc | 31 +++++++++++++------------- src/operator/tensor/cast_storage-inl.h | 2 ++ src/operator/tensor/cast_storage.cc | 2 ++ 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 6cd0633463f2..8ed5a5a470ca 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -101,24 +101,25 @@ NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ct } void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { +#if MXNET_USE_MKLDNN == 1 if (storage_type == kMKLDNNStorage) { SetMKLMem(shape, dtype); + return; } - else { - CHECK_NE(aux_shapes.size(), 0) - << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); - } - // init shape - storage_shape = shape; - // delay_alloc is only set when data storage handle is present - delay_alloc = false; - } +#endif + CHECK_NE(aux_shapes.size(), 0) + << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); + } + // init shape + storage_shape = shape; + // delay_alloc is only set when data storage handle is present + delay_alloc = false; } NDArray NDArray::grad() const { diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index 8cb62bdaabac..41b4eaa1aeca 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -324,8 +324,10 @@ void CastStorageCsrDnsImpl(const 
OpContext& ctx, }); } +#if MXNET_USE_MKLDNN == 1 void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns); void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns); +#endif template void CastStorageComputeImpl(const OpContext& ctx, diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index f1c226c9c83e..d3dc89ee3519 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -30,6 +30,7 @@ namespace mxnet { namespace op { +#if MXNET_USE_MKLDNN == 1 static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { switch(dtype) { case mshadow::kFloat32: @@ -72,6 +73,7 @@ void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArr net.push_back(mkldnn::reorder(*src.GetMKLDNNData(), *dst.GetMKLDNNData())); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } +#endif DMLC_REGISTER_PARAMETER(CastStorageParam); NNVM_REGISTER_OP(cast_storage) From f9cae1b8fd0d44c0e0e5292b61905475e3a3654b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 02:24:04 +0000 Subject: [PATCH 074/264] Fix a bug in activation. --- src/operator/nn/activation.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 1e18f4adfb5e..c9f80c1ac22a 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -141,8 +141,13 @@ inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, return true; } #endif - return ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, +#if MXNET_USE_CUDNN == 1 + return ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, dispatch_mode, in_attrs, out_attrs); +#else + return ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, in_attrs, out_attrs); +#endif } MXNET_OPERATOR_REGISTER_UNARY(Activation) From 3965435a726963a000a75c3921ce9a2158e77abe Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 23:34:07 +0000 Subject: [PATCH 075/264] Reshape of NDArray supports MKLDNN. 
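A blocked MKLDNN layout such as nChw8c encodes the original dimensions in
the format itself, so a reshaped view only makes sense once the data sits
in the plain row-major layout. Reshape therefore detaches the array and,
when it holds MKLDNN memory, reorders it to the default format first,
roughly (sketch of the code added below):

    NDArray ret = this->Detach();
    ret.shape_ = shape;
    if (ret.ptr_->Mkl_mem_)
      ret.ptr_->Mkl_mem_ = Reorder2Default(ret.ptr_->Mkl_mem_);
    return ret;

GetDefaultFormat and Reorder2Default are moved up in the file to make this
possible; Reorder2Default looks up the plain format with the same number
of dimensions and registers an mkldnn::reorder on the stream when the two
formats differ.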
--- src/ndarray/ndarray.cc | 193 ++++++++++++++++++++++------------------- 1 file changed, 102 insertions(+), 91 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 8ed5a5a470ca..16eca93f6b01 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -140,17 +140,112 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { return ret; } +#if MXNET_USE_MKLDNN == 1 + +static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { + if (desc.data.ndims == 1) + return desc.data.format; + else if (desc.data.ndims == 2) { + if (desc.data.format == mkldnn_io) + return mkldnn_oi; + else + return desc.data.format; + } + else if (desc.data.ndims == 4) { + switch (desc.data.format) { + case mkldnn_nchw: + case mkldnn_nhwc: + case mkldnn_chwn: + case mkldnn_nChw8c: + case mkldnn_nChw16c: + return mkldnn_nchw; + case mkldnn_oihw: + case mkldnn_ihwo: + case mkldnn_hwio: + case mkldnn_OIhw8i8o: + case mkldnn_OIhw16i16o: + case mkldnn_OIhw8i16o2i: + case mkldnn_OIhw8o16i2o: + case mkldnn_OIhw8o8i: + case mkldnn_OIhw16o16i: + case mkldnn_IOhw16o16i: + case mkldnn_Oihw8o: + case mkldnn_Oihw16o: + case mkldnn_Ohwi8o: + case mkldnn_Ohwi16o: + case mkldnn_OhIw16o4i: + return mkldnn_oihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } + else if (desc.data.ndims == 5) { + switch (desc.data.format) { + case mkldnn_goihw: + case mkldnn_gOIhw8i8o: + case mkldnn_gOIhw16i16o: + case mkldnn_gOIhw8i16o2i: + case mkldnn_gOIhw8o16i2o: + case mkldnn_gOIhw8o8i: + case mkldnn_gOIhw16o16i: + case mkldnn_gIOhw16o16i: + case mkldnn_gOihw8o: + case mkldnn_gOihw16o: + case mkldnn_gOhwi8o: + case mkldnn_gOhwi16o: + case mkldnn_gOhIw16o4i: + return mkldnn_goihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } + else { + LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; + return mkldnn_format_undef; + } +} + +static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem) { + auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); + if (format == mem->get_primitive_desc().desc().data.format) + return mem; + + mkldnn::memory::desc desc = mem->get_primitive_desc().desc(); + desc.data.format = format; + mkldnn::memory::primitive_desc pd(desc, mem->get_primitive_desc().get_engine()); + mkldnn_mem_ptr def_mem(new mkldnn::memory(pd)); + + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterMem(def_mem); + stream.RegisterPrim(mkldnn::reorder(*mem, *def_mem)); + // TODO do I have to submit it here? 
+ stream.Submit(); + return def_mem; +} + +#endif + NDArray NDArray::Reshape(const TShape &shape) const { CHECK(!is_none()) << "NDArray is not initialized"; - auto stype = storage_type(); - // reshape is not supported for non-default ndarray with dismatching shapes - CHECK((shape_ == shape) || stype == kDefaultStorage) - << "Reshape for storage type " << stype << " is not implemented yet"; CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape size is larger current shape"; - NDArray ret = this->Detach(); - ret.shape_ = shape; - return ret; + if (storage_type() == kDefaultStorage) { + NDArray ret = this->Detach(); + ret.shape_ = shape; + return ret; +#if MXNET_USE_MKLDNN == 1 + } else if (storage_type() == kMKLDNNStorage) { + NDArray ret = this->Detach(); + ret.shape_ = shape; + if (ret.ptr_->Mkl_mem_) + ret.ptr_->Mkl_mem_ = Reorder2Default(ret.ptr_->Mkl_mem_); + return ret; +#endif + } + LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet"; + return NDArray(); } NDArray NDArray::ReshapeWithRecord(const TShape &shape) { @@ -267,90 +362,6 @@ static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims return true; } -static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { - if (desc.data.ndims == 1) - return desc.data.format; - else if (desc.data.ndims == 2) { - if (desc.data.format == mkldnn_io) - return mkldnn_oi; - else - return desc.data.format; - } - else if (desc.data.ndims == 4) { - switch (desc.data.format) { - case mkldnn_nchw: - case mkldnn_nhwc: - case mkldnn_chwn: - case mkldnn_nChw8c: - case mkldnn_nChw16c: - return mkldnn_nchw; - case mkldnn_oihw: - case mkldnn_ihwo: - case mkldnn_hwio: - case mkldnn_OIhw8i8o: - case mkldnn_OIhw16i16o: - case mkldnn_OIhw8i16o2i: - case mkldnn_OIhw8o16i2o: - case mkldnn_OIhw8o8i: - case mkldnn_OIhw16o16i: - case mkldnn_IOhw16o16i: - case mkldnn_Oihw8o: - case mkldnn_Oihw16o: - case mkldnn_Ohwi8o: - case mkldnn_Ohwi16o: - case mkldnn_OhIw16o4i: - return mkldnn_oihw; - default: - LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; - return mkldnn_format_undef; - } - } - else if (desc.data.ndims == 5) { - switch (desc.data.format) { - case mkldnn_goihw: - case mkldnn_gOIhw8i8o: - case mkldnn_gOIhw16i16o: - case mkldnn_gOIhw8i16o2i: - case mkldnn_gOIhw8o16i2o: - case mkldnn_gOIhw8o8i: - case mkldnn_gOIhw16o16i: - case mkldnn_gIOhw16o16i: - case mkldnn_gOihw8o: - case mkldnn_gOihw16o: - case mkldnn_gOhwi8o: - case mkldnn_gOhwi16o: - case mkldnn_gOhIw16o4i: - return mkldnn_goihw; - default: - LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; - return mkldnn_format_undef; - } - } - else { - LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; - return mkldnn_format_undef; - } -} - -static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem) { - auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); - if (format == mem->get_primitive_desc().desc().data.format) - return mem; - - printf("reorder to default\n"); - mkldnn::memory::desc desc = mem->get_primitive_desc().desc(); - desc.data.format = format; - mkldnn::memory::primitive_desc pd(desc, mem->get_primitive_desc().get_engine()); - mkldnn_mem_ptr def_mem(new mkldnn::memory(pd)); - - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterMem(def_mem); - stream.RegisterPrim(mkldnn::reorder(*mem, *def_mem)); - // TODO do I have to submit it here? 
- stream.Submit(); - return def_mem; -} - void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data From 69cce011df508c958865e05eff203c476c4b737f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 23:34:39 +0000 Subject: [PATCH 076/264] Fix a memory ref bug in NDArray. --- src/ndarray/ndarray.cc | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 16eca93f6b01..78592b2dc6db 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -435,14 +435,29 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( // the default storage. If it uses the MKLDNN format, the MKL memory should // have been initialized since we are trying to get data from the array. CHECK(ptr_->Mkl_mem_ != nullptr); + // If the memory descriptor matches, it's easy. + MKLDNNStream &stream = MKLDNNStream::Instance(); + // We need to make sure Mkl_mem_ is always valid as well. + stream.RegisterMem(ptr_->Mkl_mem_); if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; } + + mkldnn::memory::primitive_desc _desc = desc; + // Now we need to determine if we should reorder the memory. + // If both use the default formats, we think we don't need to reshape. + // TODO if the memory format isn't the default one, it may not work. + auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc(); + auto desc2 = _desc.desc(); + if (desc1.data.format == GetDefaultFormat(desc1) && + desc2.data.format == GetDefaultFormat(desc2)) { + mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); + stream.RegisterMem(ret); + return ret; + } else { // TODO we should manage the memory allocation here. mkldnn_mem_ptr ret(new mkldnn::memory(desc)); - MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterMem(ret); stream.RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); return ret; From 4c4d73c204ba4f056d430247bf60696106f70a39 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 23:36:25 +0000 Subject: [PATCH 077/264] Reshape NDArray in MKLDNN FullyConnected. 
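mkldnn::inner_product only understands 2D data, so inputs with more than
two dimensions are flattened before the primitive descriptors are built.
The rule, in shapes (b and d1..dk are shorthand here, not identifiers from
the code):

    // flatten == true : (b, d1, ..., dk) -> (b, d1*...*dk)
    // flatten == false: (b, d1, ..., dk) -> (b*d1*...*d(k-1), dk)
    if (data.shape().ndim() > 2 && !param.flatten)
      data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
                                 ishape[ishape.ndim()-1]));
    else if (data.shape().ndim() > 2)
      data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));

out_grad is reshaped the same way in the backward pass, and the fully
connected primitives now describe the weight with GetMemDesc rather than
the grouped GetWeightDesc, which gains a check that grouped weights are 4D.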
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 11 +++- .../nn/mkldnn/mkldnn_fully_connected.cc | 60 ++++++++++++------- 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 6d6671c181a4..38ee74d83ce0 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -94,20 +94,25 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { } } -inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { - mkldnn::memory::dims dims(arr.shape().ndim()); +inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) { + mkldnn::memory::dims dims(ndim); for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()), mkldnn::memory::format::any}; } +inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { + return GetMemDesc(arr, arr.shape().ndim()); +} + inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, - int num_groups = 1) { + int num_groups) { if (num_groups == 1) { return GetMemDesc(arr); } else { + CHECK_EQ(arr.shape().ndim(), 4U); mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index ae80dd8f9095..2a9e1ba4f7d8 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -34,7 +34,7 @@ inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( const NDArray &data, const NDArray &weight, const NDArray *bias, const NDArray &output) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weight); + auto weight_md = GetMemDesc(weight); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Instance().get_engine(); if (bias) { @@ -54,7 +54,7 @@ inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData( const NDArray &data, const NDArray &weight, const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weight); + auto weight_md = GetMemDesc(weight); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Instance().get_engine(); mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); @@ -65,7 +65,7 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei const NDArray &data, const NDArray &weight, const NDArray *bias, const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weight); + auto weight_md = GetMemDesc(weight); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Instance().get_engine(); if (bias) { @@ -87,12 +87,18 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( - in_data[fullc::kData], in_data[fullc::kWeight], + const TShape& ishape = in_data[fullc::kData].shape(); + NDArray weight = in_data[fullc::kWeight]; + NDArray data = in_data[fullc::kData]; + if (data.shape().ndim() > 2 && !param.flatten) + data = 
data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); + else if (data.shape().ndim() > 2) + data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, param.no_bias ? nullptr : &in_data[fullc::kBias], out_data[fullc::kOut]); - auto data_mem = in_data[fullc::kData].GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); - auto weight_mem = in_data[fullc::kWeight].GetMKLDNNDataReorder( - ipFwd_pd.weights_primitive_desc()); + auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); + auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc()); auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]); if (param.no_bias) { @@ -112,19 +118,31 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &outputs) { const std::vector &in_grad = outputs; const FullyConnectedParam& param = nnvm::get(attrs.parsed); - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd( - inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], - param.no_bias ? nullptr : &in_grad[fullc::kBias], inputs[fullc::kOut]); + const TShape& ishape = inputs[fullc::kData + 1].shape(); + const TShape& oshape = inputs[fullc::kOut].shape(); + + NDArray weight = inputs[fullc::kWeight + 1]; + NDArray data = inputs[fullc::kData + 1]; + if (data.shape().ndim() > 2 && !param.flatten) + data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); + else if (data.shape().ndim() > 2) + data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + NDArray out_grad = inputs[fullc::kOut]; + if (out_grad.shape().ndim() > 2 && !param.flatten) + out_grad = out_grad.Reshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1])); + else if (out_grad.shape().ndim() > 2) + out_grad = out_grad.Reshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim()))); + + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, + param.no_bias ? nullptr : &in_grad[fullc::kBias], out_grad); CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; if (req[fullc::kData]) { mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData( - inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], inputs[fullc::kOut], - ipFwd_pd); - auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNDataReorder( + data, weight, out_grad, ipFwd_pd); + auto out_grad_mem = out_grad.GetMKLDNNDataReorder( ipBwdData_pd.diff_dst_primitive_desc()); - auto weight_mem = inputs[fullc::kWeight + 1].GetMKLDNNDataReorder( - ipBwdData_pd.weights_primitive_desc()); + auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc()); auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData], ipBwdData_pd.diff_src_primitive_desc(), req[fullc::kData]); MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_data( @@ -133,13 +151,11 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, } if (req[fullc::kWeight]) { mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd - = GetIPBwdWeights(inputs[fullc::kData + 1], inputs[fullc::kWeight + 1], - param.no_bias ? nullptr : &in_grad[fullc::kBias], inputs[fullc::kOut], - ipFwd_pd); - auto out_grad_mem = inputs[fullc::kOut].GetMKLDNNDataReorder( + = GetIPBwdWeights(data, weight, param.no_bias ? 
nullptr : &in_grad[fullc::kBias], + out_grad, ipFwd_pd); + auto out_grad_mem = out_grad.GetMKLDNNDataReorder( ipBwdWeights_pd.diff_dst_primitive_desc()); - auto data_mem = inputs[fullc::kData + 1].GetMKLDNNDataReorder( - ipBwdWeights_pd.src_primitive_desc()); + auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[fullc::kWeight], ipBwdWeights_pd.diff_weights_primitive_desc(), req[fullc::kWeight]); mkldnn_output_t in_grad_bias; From 625951c4ce60a140439b89ac5731d24459ea5032 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 8 Nov 2017 23:37:11 +0000 Subject: [PATCH 078/264] Fix data format conversion. --- src/operator/tensor/cast_storage.cc | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index d3dc89ee3519..9d6e2ec20759 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -40,28 +40,19 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { } } +static inline int get_type_size(int dtype) { + MSHADOW_TYPE_SWITCH(dtype, DType, {return sizeof(DType);}); + return -1; +} + void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) { CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask); CHECK(src.shape() == dns->shape_); CHECK_EQ(src.dtype(), dns->type_flag_); - - mkldnn::memory::dims dims(dns->shape_.ndim()); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = dns->shape_[i]; - mkldnn::memory::format layout = mkldnn::memory::format::format_undef; - switch (dns->shape_.ndim()) { - case 1: layout = mkldnn::memory::format::x; break; - case 2: layout = mkldnn::memory::format::nc; break; - case 4: layout = mkldnn::memory::format::nchw; break; - default: LOG(FATAL) << "Unsupported number of dimensions for MKLDNN"; - } - mkldnn::memory::desc data_md({dims}, get_mkldnn_type(src.dtype()), layout); - auto cpu_engine = CpuEngine::Instance().get_engine(); - mkldnn::memory dst_mem(mkldnn::memory::primitive_desc(data_md, cpu_engine), dns->dptr_); - - std::vector net; - net.push_back(mkldnn::reorder(*src.GetMKLDNNData(), dst_mem)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + // This converts the source data to the default format and copy the data to + // the destination. + const TBlob &src_blob = src.data(); + memcpy(dns->dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns->type_flag_)); } void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) { From 15ee48437152bc5c6e90536643f4ce5ee4f30c65 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 29 Nov 2017 23:21:26 +0000 Subject: [PATCH 079/264] Remove MKL code in dropout. 
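Dropout carried an MKL VSL sampling path (viRngBernoulli guarded by
USE_MKL && _OPENMP) that is unrelated to the MKLDNN integration, so it is
removed here. The single remaining mshadow path samples the keep mask with
the engine RNG and rescales by 1/pkeep so the output keeps the input's
expectation, roughly:

    mask = tcast<DType>(F<mshadow_op::threshold>(
        prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_));
    Assign(out, req[dropout::kOut], data * mask);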
--- src/operator/nn/dropout-inl.h | 55 ----------------------------------- 1 file changed, 55 deletions(-) diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 343201062dbe..1d2c9eaeb456 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -38,13 +38,6 @@ #include "../operator_common.h" #include "../mshadow_op.h" -#if defined(USE_MKL) && defined(_OPENMP) -#include - -#include -#include -#endif // USE_MKL && _OPENMP - namespace dropout { enum DropoutOpInputs {kData}; enum DropoutOpOutputs {kOut, kMask}; @@ -55,28 +48,6 @@ enum DropoutOpMode {kTraining, kAlways}; namespace mxnet { namespace op { -#if defined(USE_MKL) && defined(_OPENMP) -static void bernoulli_generate(int n, double p, int* r) { - const int seed = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) - const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); -# pragma omp parallel num_threads(nthr) - { - const int ithr = omp_get_thread_num(); - const int avg_amount = (n + nthr - 1) / nthr; - const int my_offset = ithr * avg_amount; - const int my_amount = std::min(my_offset + avg_amount, n) - my_offset; - if (my_amount > 0) { - VSLStreamStatePtr stream; - vslNewStream(&stream, VSL_BRNG_MCG31, seed); - vslSkipAheadStream(stream, my_offset); - viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, my_amount, - r + my_offset, p); - vslDeleteStream(&stream); - } - } -} -#endif // USE_MKL && _OPENMP - struct DropoutParam : public dmlc::Parameter { float p; int mode; @@ -109,23 +80,10 @@ void DropoutForward(const OpContext &ctx, const DropoutParam ¶m, Tensor out = out_data[dropout::kOut].FlatTo2D(s); if (ctx.is_train || mode_ == dropout::kAlways) { Tensor mask = out_data[dropout::kMask].FlatTo2D(s); -#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) - DType* outptr = out.dptr_; - DType* dataptr = data.dptr_; - auto maskptr = reinterpret_cast(mask.dptr_); - int count = mask.shape_[0]*mask.shape_[1]; - bernoulli_generate(count, pkeep_, maskptr); - const float pk_1 = 1.0f / pkeep_; -#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int i = 0; i < count; ++i) { - outptr[i] = dataptr[i] * maskptr[i] * pk_1; - } -#else Random *prnd = ctx.requested[dropout::kRandom].get_random(s); mask = tcast(F( prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_)); Assign(out, req[dropout::kOut], data * mask); -#endif // USE_MKL && _OPENMP } else { Assign(out, req[dropout::kOut], F(data)); } @@ -143,20 +101,7 @@ void DropoutBackward(const OpContext &ctx, const DropoutParam ¶m, Tensor mask = out_data_mask.FlatTo2D(s); Tensor gdata = in_grad.FlatTo2D(s); if (ctx.is_train || mode_ == dropout::kAlways) { -#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) - real_t pkeep_ = 1.0f - param.p; - DType* ingradptr = gdata.dptr_; - DType* outgradptr = grad.dptr_; - auto maskptr = reinterpret_cast(mask.dptr_); - int count = mask.shape_[0]*mask.shape_[1]; - const float pk_1 = 1.0f / pkeep_; -#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int i = 0; i < count; ++i) { - ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1; - } -#else // USE_MKL && _OPENMP Assign(gdata, req, grad * mask); -#endif // USE_MKL && _OPENMP } else { Assign(gdata, req, F(grad)); } From 87796126b76d92842e46c91916c833c232e8ef3e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 10 Nov 2017 01:15:08 +0000 Subject: [PATCH 080/264] Create MKLDNN NDArray in python. 
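On the Python side, arrays with MKLDNN storage now come back from
_ndarray_cls as MKLNDArray, a thin NDArray subclass created non-writable:
__repr__ skips the data contents since the arrays tend to be large,
_slice routes through op.slice rather than touching the raw handle, and
_at is not supported. astype and copyto appear to be copied over from the
sparse classes and are still marked TODO.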
--- python/mxnet/ndarray/mkldnn.py | 113 +++++++++++++++++++++++++++++++++ python/mxnet/ndarray/sparse.py | 3 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 python/mxnet/ndarray/mkldnn.py diff --git a/python/mxnet/ndarray/mkldnn.py b/python/mxnet/ndarray/mkldnn.py new file mode 100644 index 000000000000..e90fd77a34db --- /dev/null +++ b/python/mxnet/ndarray/mkldnn.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import, unused-wildcard-import, too-many-lines + +"""MKLDNN NDArray API of MXNet.""" + +from __future__ import absolute_import +from __future__ import division +try: + from __builtin__ import slice as py_slice + from __builtin__ import sum as py_sum +except ImportError: + from builtins import slice as py_slice + from builtins import sum as py_sum + +import ctypes +import warnings + +__all__ = ["_ndarray_cls", "MKLNDArray"] + +import numpy as np +from ..base import _LIB, numeric_types +from ..base import c_array, mx_real_t, integer_types +from ..base import mx_uint, NDArrayHandle, check_call +from ..context import Context +from . import _internal +from . import op +from ._internal import _set_ndarray_class +from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP +from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_MKLDNN +from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT +from .ndarray import zeros as _zeros_ndarray +from .ndarray import array as _array + +class MKLNDArray(NDArray): + """The base class of an NDArray stored in a MKLDNN storage format. + """ + + def __repr__(self): + """Returns a string representation of the sparse array.""" + shape_info = 'x'.join(['%d' % x for x in self.shape]) + # The data content is not displayed since the array usually has big shape + return '\n<%s %s @%s>' % (self.__class__.__name__, + shape_info, self.context) + + # TODO + def _at(self, idx): + raise NotSupportedForMKLNDArray(self._at, '[idx]', idx) + + def _slice(self, start, stop): + return op.slice(self, begin=start, end=stop) + + # TODO + def astype(self, dtype): + """Returns a copy of the array after casting to a specified type. + Parameters + ---------- + dtype : numpy.dtype or str + The type of the returned array. + Examples + -------- + >>> x = mx.nd.sparse.zeros('row_sparse', (2,3), dtype='float32') + >>> y = x.astype('int32') + >>> y.dtype + + """ + res = zeros(shape=self.shape, ctx=self.context, + dtype=dtype, stype=self.stype) + self.copyto(res) + return res + + # TODO + def copyto(self, other): + """Copies the value of this array to another array. + + Parameters + ---------- + other : NDArray or CSRNDArray or RowSparseNDArray or Context + The destination array or context. 
+ + Returns + ------- + NDArray or CSRNDArray or RowSparseNDArray + The copied array. + """ + if isinstance(other, NDArray): + if other.handle is self.handle: + warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) + return + return _internal._copyto(self, out=other) + elif isinstance(other, Context): + hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other, + True, self.dtype, self._aux_types)) + return _internal._copyto(self, out=hret) + else: + raise TypeError('copyto does not support type ' + str(type(other))) + diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 0a667741e144..396163fccb66 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -49,6 +49,7 @@ pass from ._internal import _set_ndarray_class from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP +from .mkldnn import MKLNDArray from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR, _STORAGE_TYPE_MKLDNN from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT from .ndarray import zeros as _zeros_ndarray @@ -1139,7 +1140,7 @@ def _ndarray_cls(handle, writable=True, stype=_STORAGE_TYPE_UNDEFINED): if stype == _STORAGE_TYPE_DEFAULT: return NDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_MKLDNN: - return NDArray(handle, writable=False) + return MKLNDArray(handle, writable=False) elif stype == _STORAGE_TYPE_CSR: return CSRNDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_ROW_SPARSE: From bc39849d3b214a9826c91daaf7983460c482d3a4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 11 Nov 2017 00:19:30 +0000 Subject: [PATCH 081/264] Support Slice for MKLDNN NDArray. --- src/ndarray/ndarray.cc | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 78592b2dc6db..f1a077cef7d1 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -266,12 +266,34 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) { return ret; } - NDArray NDArray::Slice(index_t begin, index_t end) const { CHECK(!is_none()) << "NDArray is empty"; CHECK_LE(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; +#if MXNET_USE_MKLDNN == 1 + CHECK(storage_type() == kDefaultStorage || storage_type() == kMKLDNNStorage); + if (storage_type() == kMKLDNNStorage) { + TShape new_shape = shape_; + new_shape[0] = end - begin; + NDArray ret(kMKLDNNStorage, new_shape, ctx(), ptr_->delay_alloc, dtype()); + size_t length = shape_.ProdShape(1, shape_.ndim()); + MSHADOW_TYPE_SWITCH(ret.dtype(), DType, { + ret.byte_offset_ += begin * length * sizeof(DType); + }); + + // We need to convert the MKL memory to the default layout. 
+ Engine::Get()->PushSync([&](RunContext ctx) { + auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); + if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { + ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); + } + + }, ctx(), {this->var()}, {ret.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); + return ret; + } +#endif CHECK_EQ(storage_type(), kDefaultStorage); NDArray ret = this->Detach(); size_t length = shape_.ProdShape(1, shape_.ndim()); @@ -480,6 +502,7 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { LOG(FATAL) << "The NDArray hasn't been initialized"; return; } + // TODO if the shape mismatches. ptr_->SetMKLMem(shape_, dtype_); MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); @@ -517,8 +540,6 @@ void NDArray::SetTBlob() const { shape = storage_shape(); #if MXNET_USE_MKLDNN == 1 } else if (stype == kMKLDNNStorage) { - // TODO we may really need to convert format. - CHECK_EQ(byte_offset_, 0); if (ptr_->Mkl_mem_) ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_); else From 25afbd70d1494d8919664181fcbb42846968cc0e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 11 Nov 2017 00:22:00 +0000 Subject: [PATCH 082/264] Reduce the overhead of summing the result to the output array. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 38ee74d83ce0..14a04defdde7 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -188,9 +188,7 @@ static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) else if (res.first == AddBack) { // TODO I might need to reorder. mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); - mkldnn_mem_ptr out = CreateMKLDNNMem(res.second->get_primitive_desc()); - op::Sum(*res.second, *mem, *out); - const_cast(arr).CopyFrom(*out); + op::Sum(*res.second, *mem, *mem); } } From be6f3e779870e8242cf1424b849115e5c8469688 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 11 Nov 2017 02:13:26 +0000 Subject: [PATCH 083/264] Avoid unnecessary memory copy in NDArray. --- src/ndarray/ndarray.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f1a077cef7d1..34f41ec9cfaf 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -502,6 +502,8 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { LOG(FATAL) << "The NDArray hasn't been initialized"; return; } + if (ptr_->Mkl_mem_.get() == &mem) + return; // TODO if the shape mismatches. 
ptr_->SetMKLMem(shape_, dtype_); @@ -510,9 +512,19 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { std::shared_ptr NDArray::CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc) { - if (storage_type() != kMKLDNNStorage) + mkldnn::memory::primitive_desc _desc = desc; + auto required_format = _desc.desc().data.format; + auto def_format = GetDefaultFormat(_desc.desc()); + if (storage_type() != kMKLDNNStorage && required_format != def_format) return nullptr; + if (required_format == def_format) { + ptr_->SetMKLMem(shape_, dtype_); + CHECK(ptr_->Mkl_mem_->get_primitive_desc() == desc); + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + return ptr_->Mkl_mem_; + } + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; From 6f12fb3594879228f2275c6b9cfeae99d1254c09 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 13 Nov 2017 23:47:43 +0000 Subject: [PATCH 084/264] Fix a bug in data reordering. --- src/ndarray/ndarray.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 34f41ec9cfaf..1673098a45d2 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -212,10 +212,16 @@ static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem) { if (format == mem->get_primitive_desc().desc().data.format) return mem; - mkldnn::memory::desc desc = mem->get_primitive_desc().desc(); - desc.data.format = format; - mkldnn::memory::primitive_desc pd(desc, mem->get_primitive_desc().get_engine()); - mkldnn_mem_ptr def_mem(new mkldnn::memory(pd)); + auto pd = mem->get_primitive_desc(); + mkldnn::memory::dims dims(pd.desc().data.ndims); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = pd.desc().data.dims[i]; + mkldnn::memory::format cpp_format = static_cast(format); + mkldnn::memory::data_type cpp_type = static_cast( + pd.desc().data.data_type); + mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); + mkldnn_mem_ptr def_mem(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, + pd.get_engine()))); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterMem(def_mem); From 6301b0da702798bb3c2ced70cf0968f1b9fa89ae Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 14 Nov 2017 19:14:03 +0000 Subject: [PATCH 085/264] Fix a bug in NDArray. --- src/ndarray/ndarray.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 1673098a45d2..9a0b7b092d48 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -524,18 +524,19 @@ std::shared_ptr NDArray::CreateMKLDNNData( if (storage_type() != kMKLDNNStorage && required_format != def_format) return nullptr; + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return nullptr; + } + + // If the required format is a default format, we don't need to worry about the shape. + // If the shape isn't the same, it actually implicitly reshapes data. 
if (required_format == def_format) { ptr_->SetMKLMem(shape_, dtype_); - CHECK(ptr_->Mkl_mem_->get_primitive_desc() == desc); MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; } - if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { - LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; - return nullptr; - } - if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; From 9c62198da3a09efc5e5fcf87830fb427624a0c6d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 14 Nov 2017 19:22:26 +0000 Subject: [PATCH 086/264] Don't hard code MKLDNN type. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 14a04defdde7..3c36761a81f1 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -195,28 +195,26 @@ static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, int num_groups) { mkldnn_mem_const_ptr mem; + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Instance().get_engine(); if (arr.shape().ndim() == 2) { mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], (int) arr.shape()[1]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oi}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4 && num_groups == 1) { mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4) { mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::goihw}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); } @@ -234,27 +232,25 @@ inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, const mkldnn::engine &engine, int num_groups = 1) { + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); if (arr.shape().ndim() == 2) { mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], (int) arr.shape()[1]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oi}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; mkldnn::memory::primitive_desc pd = 
mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4 && num_groups == 1) { mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4) { mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::goihw}; + mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); } From 59d2ab4882b33fed619b89c9fdbab060035fcfb5 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 16 Nov 2017 00:32:47 +0000 Subject: [PATCH 087/264] Support dilation in MKLDNN convolution. --- src/operator/nn/mkldnn/mkldnn_convolution.cc | 115 +++++++++---------- 1 file changed, 57 insertions(+), 58 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 61134d0d8021..e152a29fc92f 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -49,38 +49,38 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwd( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { + if (param.dilate.ndim() == 0 && bias == nullptr) { mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } - else /*if (param.dilate.ndim() == 0)*/ { + else if (param.dilate.ndim() == 0) { auto bias_md = GetMemDesc(*bias); mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, weight_md, bias_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } -// else { -// // TODO I should test the case with dilate. 
-// mkldnn::memory::dims dilates{0, 0}; -// if (param.dilate.ndim() == 2) { -// dilates[0] = param.dilate[0]; -// dilates[1] = param.dilate[1]; -// } -// if (bias_mem == nullptr) { -// mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, -// data_md, weights_md, out_md, strides, dilates, padding, padding, -// mkldnn::padding_kind::zero); -// return mkldnn::convolution_forward::primitive_desc(desc, engine); -// } -// else { -// mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, -// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, -// strides, dilates, padding, padding, mkldnn::padding_kind::zero); -// return mkldnn::convolution_forward::primitive_desc(desc, engine); -// } -// } + else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + if (bias == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + else { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + } } static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( @@ -100,23 +100,22 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } -// if (param.dilate.ndim() == 0) { + if (param.dilate.ndim() == 0) { mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); -// } -// else { -// // TODO I should test the case with dilate. 
-// mkldnn::memory::dims dilates{0, 0}; -// if (param.dilate.ndim() == 2) { -// dilates[0] = param.dilate[0]; -// dilates[1] = param.dilate[1]; -// } -// mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, -// data_md, weights_md, out_md, strides, dilates, padding, padding, -// mkldnn::padding_kind::zero); -// return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); -// } + } + else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); + } } static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( @@ -137,38 +136,38 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { + if (param.dilate.ndim() == 0 && bias == nullptr) { mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } - else /*if (param.dilate.ndim() == 0)*/ { + else if (param.dilate.ndim() == 0) { auto bias_md = GetMemDesc(*bias); mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md, weight_md, bias_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } -// else { -// // TODO I should test the case with dilate. 
-// mkldnn::memory::dims dilates{0, 0}; -// if (param.dilate.ndim() == 2) { -// dilates[0] = param.dilate[0]; -// dilates[1] = param.dilate[1]; -// } -// if (bias_mem == nullptr) { -// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, -// data_md, weights_md, out_md, strides, dilates, padding, padding, -// mkldnn::padding_kind::zero); -// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); -// } -// else { -// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, -// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, -// strides, dilates, padding, padding, mkldnn::padding_kind::zero); -// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); -// } -// } + else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + if (bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + else { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + } } void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, From 8fa58eb2c930f189253097084562c9c51ef00417 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 16 Nov 2017 03:08:02 +0000 Subject: [PATCH 088/264] Fix a bug in sum results. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 3c36761a81f1..c13f29a3a6ea 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -188,7 +188,11 @@ static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) else if (res.first == AddBack) { // TODO I might need to reorder. mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); - op::Sum(*res.second, *mem, *mem); + // We have to allocate new memory for the sum result. + mkldnn_mem_ptr sum_res(new mkldnn::memory(res.second->get_primitive_desc())); + MKLDNNStream::Instance().RegisterMem(sum_res); + op::Sum(*res.second, *mem, *sum_res); + const_cast(arr).CopyFrom(*sum_res); } } From ba1be8fce8e8cd2ea9dd58b9c56aa533746447ad Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 16 Nov 2017 03:29:47 +0000 Subject: [PATCH 089/264] Rewrite GetMKLDNNData. 
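
GetMKLDNNData now distinguishes three cases instead of assuming the stored
memory always matches the request: an exact primitive-descriptor match hands
out the stored memory as-is; if both the stored and the requested descriptor
use the default format, the existing handle is wrapped under the requested
descriptor without a copy; anything else returns nullptr so the caller can
reorder or fall back. A minimal standalone sketch of that dispatch (plain
C++ with descriptors reduced to a format tag plus an element count; the
names are illustrative, not part of the patch):

    #include <cstddef>
    #include <memory>

    enum class Format { kDefault, kBlocked };  // stand-in for MKLDNN format tags

    struct Desc {
      Format fmt;
      size_t nelems;
      bool operator==(const Desc &o) const {
        return fmt == o.fmt && nelems == o.nelems;
      }
    };

    struct Memory {
      Desc desc;
      void *handle;  // the underlying buffer
    };

    // Exact match: return the stored memory, no copy. Both default: alias the
    // same buffer under the new descriptor (the "implicit reshape" case, legal
    // because the default layout is contiguous). Otherwise: nullptr, and the
    // caller must reorder or fall back.
    std::shared_ptr<Memory> GetData(const std::shared_ptr<Memory> &stored,
                                    const Desc &req) {
      if (stored->desc == req)
        return stored;
      if (stored->desc.fmt == Format::kDefault && req.fmt == Format::kDefault &&
          stored->desc.nelems == req.nelems)
        return std::make_shared<Memory>(Memory{req, stored->handle});
      return nullptr;
    }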
--- src/ndarray/ndarray.cc | 27 +++++++++++++++++------- src/operator/nn/mkldnn/mkldnn_base-inl.h | 1 + 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 9a0b7b092d48..e6186ed10ca9 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -437,17 +437,28 @@ std::shared_ptr NDArray::GetMKLDNNData( LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; } - if (ptr_->Mkl_mem_) { - CHECK(ptr_->Mkl_mem_->get_primitive_desc() == desc); + if (ptr_->storage_type == kDefaultStorage) { + ptr_->SetMKLMem(shape_, dtype_); + } + CHECK(ptr_->Mkl_mem_ != nullptr); + mkldnn::memory::primitive_desc _desc = desc; + auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc(); + auto desc2 = _desc.desc(); + // The MKL memory has the same format and shape as required, + // or both use the default format, we can return the MKL memory. + if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; } - // If we are getting data from the NDArray, it has to use the default storage - // if Mkl_mem_ is null. - CHECK_EQ(ptr_->storage_type, kDefaultStorage); - mkldnn_mem_const_ptr ret(new mkldnn::memory(desc, ptr_->shandle.dptr)); - MKLDNNStream::Instance().RegisterMem(ret); - return ret; + else if (desc1.data.format == GetDefaultFormat(desc1) + && desc2.data.format == GetDefaultFormat(desc2)) { + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); + MKLDNNStream::Instance().RegisterMem(ret); + return ret; + } + else + return nullptr; } std::shared_ptr NDArray::GetMKLDNNDataReorder( diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index c13f29a3a6ea..33b9884e6252 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -188,6 +188,7 @@ static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) else if (res.first == AddBack) { // TODO I might need to reorder. mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); + CHECK(mem != nullptr); // We have to allocate new memory for the sum result. mkldnn_mem_ptr sum_res(new mkldnn::memory(res.second->get_primitive_desc())); MKLDNNStream::Instance().RegisterMem(sum_res); From 81493c710a1fb885b6d25848111e080f54638780 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 16 Nov 2017 19:12:36 +0000 Subject: [PATCH 090/264] Add prepare_mkldnn.sh --- prepare_mkldnn.sh | 121 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100755 prepare_mkldnn.sh diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh new file mode 100755 index 000000000000..7a4fe4ce5207 --- /dev/null +++ b/prepare_mkldnn.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# set -ex +# +# All modification made by Intel Corporation: © 2016 Intel Corporation +# +# All contributions by the University of California: +# Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +# All rights reserved. +# +# All other contributions: +# Copyright (c) 2014, 2015, the respective contributors +# All rights reserved. +# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md +# +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Intel Corporation nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +MXNET_ROOTDIR="$(pwd)" +MKLDNN_ROOTDIR="$MXNET_ROOTDIR/external/mkldnn" +MKLDNN_GITHUB="https://github.com/01org/mkl-dnn.git" +MKLDNN_TMPDIR="$MKLDNN_ROOTDIR/tmp" +MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src" +MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build" +MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install" + +# MKL DNN release tag, or commit. +MKLDNN_COMMIT="v0.11" + +# MKLDNN install destination +HOME_MKLDNN=$1 +if [ ! -z "$HOME_MKLDNN" ]; then + mkdir -p $HOME_MKLDNN + if [ ! -w $HOME_MKLDNN ]; then + echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2 + exit 1 + fi +fi + +if [ -z $MKLDNNROOT ]; then +if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then + mkdir -p $MKLDNN_INSTALLDIR + if [ ! -d $MKLDNN_SRCDIR/.git ]; then + echo "Downloading MKLDNN ..." >&2 + rm -rf $MKLDNN_SRCDIR + git clone --quiet --no-checkout $MKLDNN_GITHUB $MKLDNN_TMPDIR + rsync -a $MKLDNN_TMPDIR/ $MKLDNN_SRCDIR && rm -rf $MKLDNN_TMPDIR + fi + cd $MKLDNN_SRCDIR && git fetch --all && git reset --hard $MKLDNN_COMMIT + if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then + rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. 
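+    # prepare_mkl.sh populates external/ with an MKL runtime; copy its headers
+    # and libraries next to the MKLDNN install so MKLROOT can point there.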
+    cp -a external/*/* $MKLDNN_INSTALLDIR/.
+  fi
+  echo "Building MKLDNN ..." >&2
+  cd $MXNET_ROOTDIR
+  cmake $MKLDNN_SRCDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR
+  make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l)
+  make -C $MKLDNN_BUILDDIR install
+  rm -rf $MKLDNN_BUILDDIR
+fi
+MKLDNNROOT=$MKLDNN_INSTALLDIR
+fi
+
+if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then
+  MKLROOT=$MKLDNNROOT;
+fi
+
+# user specified MKLDNN install folder
+if [ -d "$HOME_MKLDNN" ]; then
+  # skip if user specified MKLDNNROOT
+  [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/.
+  [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/.
+  # update ldconfig if possible
+  if [ -w /etc/ld.so.conf.d ]; then
+    echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig
+  fi
+  # return value to calling script (Makefile, cmake)
+  echo $HOME_MKLDNN $HOME_MKLDNN
+else
+  echo $MKLDNNROOT $MKLROOT
+fi
+

From 28a7880febf8bb7485aab8ee18b5ddced1075087 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Thu, 16 Nov 2017 21:18:23 +0000
Subject: [PATCH 091/264] Enable MKLDNN activation.

---
 src/operator/nn/activation.cc               | 40 +++++++----------
 .../{mkldnn_relu-inl.h => mkldnn_act-inl.h} | 44 +++++++++++++------
 2 files changed, 47 insertions(+), 37 deletions(-)
 rename src/operator/nn/mkldnn/{mkldnn_relu-inl.h => mkldnn_act-inl.h} (71%)

diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index c9f80c1ac22a..6a6ebfddb3ff 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -27,7 +27,7 @@
 #include "../mshadow_op.h"
 #include "../tensor/elemwise_unary_op.h"
 #if MXNET_USE_MKLDNN == 1
-#include "./mkldnn/mkldnn_relu-inl.h"
+#include "./mkldnn/mkldnn_act-inl.h"
 #endif  // MXNET_USE_MKLDNN

 namespace mxnet {
@@ -58,14 +58,12 @@ static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
 #if MXNET_USE_MKLDNN == 1
-  if (param.act_type == activation::kReLU) {
-    switch (inputs[0].dtype()) {
-      case mshadow::kFloat32:
-        MKLDNNRelu_Forward<float>(ctx, inputs[0], req[0], outputs[0]);
-        return;
-      default:
-        break;
-    }
+  switch (inputs[0].dtype()) {
+    case mshadow::kFloat32:
+      MKLDNNAct_Forward<float>(ctx, param, inputs[0], req[0], outputs[0]);
+      return;
+    default:
+      break;
   }
 #endif
   _ActivationCompute<cpu>(param, ctx, inputs[0].data(), req[0],
@@ -84,15 +82,13 @@ void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs,
 #endif
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 #if MXNET_USE_MKLDNN == 1
-  if (param.act_type == activation::kReLU) {
-    switch (inputs[0].dtype()) {
-      case mshadow::kFloat32:
-        MKLDNNRelu_Backward<float>(ctx, inputs[0], inputs[1], req[0],
-            outputs[0]);
-        return;
-      default:
-        break;
-    }
+  switch (inputs[0].dtype()) {
+    case mshadow::kFloat32:
+      MKLDNNAct_Backward<float>(ctx, param, inputs[0], inputs[1], req[0],
+          outputs[0]);
+      return;
+    default:
+      break;
   }
 #endif
   _ActivationGradCompute<cpu>(param, ctx, inputs[0].data(), inputs[1].data(),
@@ -108,9 +104,7 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 #if MXNET_USE_MKLDNN == 1
-  if (param.act_type == activation::kReLU
-      && dev_mask == mshadow::cpu::kDevMask) {
-    // TODO we don't know the type.
+ if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -133,9 +127,7 @@ inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (param.act_type == activation::kReLU - && dev_mask == mshadow::cpu::kDevMask) { - // TODO we don't know the type. + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; diff --git a/src/operator/nn/mkldnn/mkldnn_relu-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h similarity index 71% rename from src/operator/nn/mkldnn/mkldnn_relu-inl.h rename to src/operator/nn/mkldnn/mkldnn_act-inl.h index 25ad61a5d68c..b368913a61a3 100644 --- a/src/operator/nn/mkldnn/mkldnn_relu-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -18,13 +18,13 @@ */ /*! - * \file mkldnn_relu-inl.h + * \file mkldnn_act-inl.h * \brief * \author Da Zheng */ -#ifndef MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_ -#define MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_ +#ifndef MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ +#define MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ #include @@ -45,20 +45,37 @@ namespace mxnet { namespace op { +static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { + switch (param.act_type) { + case activation::kReLU: + return mkldnn::algorithm::eltwise_relu; + case activation::kSigmoid: + return mkldnn::algorithm::eltwise_logistic; + case activation::kTanh: + return mkldnn::algorithm::eltwise_tanh; + case activation::kSoftReLU: + return mkldnn::algorithm::eltwise_soft_relu; + default: + LOG(FATAL) << "unknown activation type"; + return mkldnn::algorithm::eltwise_relu; + } +} + template -void MKLDNNRelu_Forward(const OpContext &ctx, const NDArray &in_data, - const OpReqType &req, const NDArray &out_data) { +void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, + const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { std::shared_ptr input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); auto cpu_engine = data_mpd.get_engine(); Dtype alpha = 0; + auto alg = GetMKLDNNActAlgo(param); mkldnn::eltwise_forward::desc desc = ctx.is_train ? 
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
-      mkldnn::eltwise_relu, data_md, alpha)
+      alg, data_md, alpha)
     : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring,
-      mkldnn::eltwise_relu, data_md, alpha);
+      alg, data_md, alpha);
   mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine);

   std::shared_ptr<const mkldnn::memory> output_memory
@@ -69,9 +86,9 @@ void MKLDNNRelu_Forward(const OpContext &ctx, const NDArray &in_data,
 }

 template<typename Dtype>
-void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad,
-                         const NDArray &in_data, const OpReqType &req,
-                         const NDArray &in_grad) {
+void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param,
+    const NDArray &out_grad, const NDArray &in_data, const OpReqType &req,
+    const NDArray &in_grad) {
   if (req == kNullOp) {
     return;
   }
@@ -84,10 +101,11 @@ void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad,
   auto cpu_engine = data_mpd.get_engine();
   Dtype alpha = 0;

+  auto alg = GetMKLDNNActAlgo(param);
   mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
-      mkldnn::eltwise_relu, data_md, alpha);
+      alg, data_md, alpha);
   mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
-  mkldnn::eltwise_backward::desc bw_desc(mkldnn::eltwise_relu, diff_md, data_md, alpha);
+  mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha);
   mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc);

   auto diff_src_memory = CreateMKLDNNMem(in_grad, bw_pdesc.diff_src_primitive_desc(), req);
@@ -102,4 +120,4 @@ void MKLDNNRelu_Backward(const OpContext &ctx, const NDArray &out_grad,
 }  // namespace mxnet
 #endif

-#endif  // MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_

From 6f198749b8b6818a4168741966c014a6f9b39384 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Fri, 17 Nov 2017 01:47:30 +0000
Subject: [PATCH 092/264] Fix a bug on FullyConnected.

---
 src/operator/nn/fully_connected.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 1178c0729bd8..7f4e9796b024 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -226,6 +226,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
 .add_arguments(FullyConnectedParam::__FIELDS__());

 NNVM_REGISTER_OP(_backward_FullyConnected)
+.set_num_inputs(3)
 .set_num_outputs([](const NodeAttrs& attrs) {
   const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
   return params.no_bias ? 2 : 3;

From 873cae94c7f2e7e1434b76e1218a9185fe261aba Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Fri, 17 Nov 2017 02:19:40 +0000
Subject: [PATCH 093/264] Handle 3 dims for MKLDNN NDArray.

---
 src/ndarray/ndarray.cc | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index e6186ed10ca9..2c75f6babef8 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -399,15 +399,27 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
     return;
   }

-  mkldnn::memory::dims dims(shape.ndim());
-  for (size_t i = 0; i < dims.size(); i++)
-    dims[i] = shape[i];
+  mkldnn::memory::dims dims;
+  // These are shapes supported by MKLDNN.
+  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4) {
+    dims.resize(shape.ndim());
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = shape[i];
+  }
+  // If there are 3 dimensions, we'll force it to 4 dimensions.
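+  // e.g. a (c, h, w) tensor is viewed as (1, c, h, w); prepending a batch of 1
+  // lets the 4-D nchw layout below apply unchanged.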
+ else if (shape.ndim() == 3) { + dims.resize(shape.ndim() + 1); + dims[0] = 1; + for (size_t i = 0; i < shape.ndim(); i++) + dims[i + 1] = shape[i]; + } + else + LOG(FATAL) << "Unsupported number of dimensions for MKLDNN"; mkldnn::memory::format layout = mkldnn::memory::format::format_undef; - switch (shape.ndim()) { + switch (dims.size()) { case 1: layout = mkldnn::memory::format::x; break; case 2: layout = mkldnn::memory::format::nc; break; case 4: layout = mkldnn::memory::format::nchw; break; - default: LOG(FATAL) << "Unsupported number of dimensions for MKLDNN"; } mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; auto cpu_engine = CpuEngine::Instance().get_engine(); From 1aab48f3b854dbc0e1d7bd9f9e120e224e50295a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 17 Nov 2017 18:15:27 +0000 Subject: [PATCH 094/264] Fix a bug in MKLDNN FC. --- src/operator/nn/mkldnn/mkldnn_fully_connected.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 2a9e1ba4f7d8..9febc080729b 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -90,9 +90,9 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const TShape& ishape = in_data[fullc::kData].shape(); NDArray weight = in_data[fullc::kWeight]; NDArray data = in_data[fullc::kData]; - if (data.shape().ndim() > 2 && !param.flatten) + if (data.shape().ndim() != 2 && !param.flatten) data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); - else if (data.shape().ndim() > 2) + else if (data.shape().ndim() != 2) data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, @@ -123,14 +123,14 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, NDArray weight = inputs[fullc::kWeight + 1]; NDArray data = inputs[fullc::kData + 1]; - if (data.shape().ndim() > 2 && !param.flatten) + if (data.shape().ndim() != 2 && !param.flatten) data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); - else if (data.shape().ndim() > 2) + else if (data.shape().ndim() != 2) data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); NDArray out_grad = inputs[fullc::kOut]; - if (out_grad.shape().ndim() > 2 && !param.flatten) + if (out_grad.shape().ndim() != 2 && !param.flatten) out_grad = out_grad.Reshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1])); - else if (out_grad.shape().ndim() > 2) + else if (out_grad.shape().ndim() != 2) out_grad = out_grad.Reshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim()))); mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, From e280827d543de3ae2c94ab9eaca93880393694c1 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 17 Nov 2017 18:36:52 +0000 Subject: [PATCH 095/264] Support MKLDNN storage in KV store. 
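
The local KV store only sanity-checks storage types on push and pull;
kMKLDNNStorage is safe to let through because the data is materialized in a
layout compatible with the default storage on the way in. A compact sketch of
the accept predicates (storage types reduced to an enum; the names are
illustrative, not the kvstore API):

    enum StorageType { kDefault, kRowSparse, kCSR, kMKLDNN };

    // Push accepts dense and row-sparse arrays, plus MKLDNN arrays when the
    // build enables MKLDNN; pull accepts dense and (conditionally) MKLDNN.
    inline bool PushAccepts(StorageType st, bool mkldnn_enabled) {
      return st == kDefault || st == kRowSparse ||
             (mkldnn_enabled && st == kMKLDNN);
    }

    inline bool PullAccepts(StorageType st, bool mkldnn_enabled) {
      return st == kDefault || (mkldnn_enabled && st == kMKLDNN);
    }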
---
 src/kvstore/kvstore_local.h | 17 +++++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 1bb84fdc1114..41b5b3030dd8 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -256,7 +256,13 @@ class KVStoreLocal : public KVStore {
     auto validator = [this](const int key, const NDArray& nd) -> bool {
       auto stype = nd.storage_type();
       // valid NDArray
-      if (stype == kDefaultStorage || stype == kRowSparseStorage) return true;
+      if (stype == kDefaultStorage || stype == kRowSparseStorage
+          // When it's kMKLDNNStorage, it'll be converted to a data layout
+          // compatible with the default storage.
+#if MXNET_USE_MKLDNN == 1
+          || stype == kMKLDNNStorage
+#endif
+          ) return true;
       // invalid NDArray, abort
       LOG(FATAL) << "Unexpected storage type detected during kvstore push: " << stype;
       return false;
@@ -272,8 +278,15 @@ class KVStoreLocal : public KVStore {
                      std::vector<std::vector<NDArray*>> *grouped_vals) {
     // check if the storage type of a value is valid
     auto validator = [this](const int key, const NDArray* nd) -> bool {
+      auto stype = nd->storage_type();
       // valid
-      if (nd->storage_type() == kDefaultStorage) return true;
+      if (stype == kDefaultStorage
+          // When it's kMKLDNNStorage, it'll be converted to a data layout
+          // compatible with the default storage.
+#if MXNET_USE_MKLDNN == 1
+          || stype == kMKLDNNStorage
+#endif
+          ) return true;
       // invalid, print warning messages once
       if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) {
         LOG(INFO) << "Warning: non-default weights detected during kvstore pull. "

From b76418e04b3d1718b4a3913af5855c7c085c353e Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Fri, 17 Nov 2017 19:11:59 +0000
Subject: [PATCH 096/264] Fix a bug in executor for non-default NDArray.

---
 src/executor/graph_executor.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 6afb1a6f8e25..bf620c832604 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -1214,7 +1214,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
       const NDArray& src = data_pool_.at(storage_id);
       data_entry_[i] = src.AsArray(vshape[i], vdtype[i]);
     } else {
-      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]);
+      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i],
+                               true, vdtype[i]);
     }
     if (log_verbose_) {
       LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type);

From 645e9ae83280102d803de994869ff57d2cfc1efa Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Wed, 29 Nov 2017 23:33:10 +0000
Subject: [PATCH 097/264] Fix a link error in cast_storage.cc.
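
A plausible reading of the link error, stated as an assumption rather than
taken from the patch: CHECK_EQ binds its arguments to const references, which
ODR-uses the in-class static constant mshadow::cpu::kDevMask and so requires
an out-of-line definition that no translation unit provides; comparing
against the enum value Context::kCPU needs no storage at all. A
self-contained illustration of the pitfall (hypothetical names):

    struct Dev { static const int kDevMask = 1; };  // no out-of-line definition

    template <typename T>
    bool eq_by_ref(const T &a, const T &b) { return a == b; }  // ODR-uses a, b

    int main() {
      // eq_by_ref(Dev::kDevMask, 1);  // may fail to link before C++17:
      //                               // undefined reference to Dev::kDevMask
      return eq_by_ref(+Dev::kDevMask, 1) ? 0 : 1;  // unary + yields a prvalue
    }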
--- src/operator/tensor/cast_storage.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 9d6e2ec20759..0ef401893399 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -46,7 +46,7 @@ static inline int get_type_size(int dtype) { } void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) { - CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask); + CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), Context::kCPU); CHECK(src.shape() == dns->shape_); CHECK_EQ(src.dtype(), dns->type_flag_); // This converts the source data to the default format and copy the data to @@ -56,7 +56,7 @@ void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) } void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) { - CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), cpu::kDevMask); + CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), Context::kCPU); CHECK(dst.shape() == src.shape()); CHECK_EQ(dst.dtype(), src.dtype()); From 706ae04ffca70c2d76e7d540f0f58e12a9f3bb38 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 17 Nov 2017 19:13:13 +0000 Subject: [PATCH 098/264] Remove unnecessary function def --- src/operator/tensor/cast_storage.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 0ef401893399..0ba1efc700de 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -31,14 +31,6 @@ namespace mxnet { namespace op { #if MXNET_USE_MKLDNN == 1 -static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { - switch(dtype) { - case mshadow::kFloat32: - return mkldnn::memory::data_type::f32; - default: - return mkldnn::memory::data_type::data_undef; - } -} static inline int get_type_size(int dtype) { MSHADOW_TYPE_SWITCH(dtype, DType, {return sizeof(DType);}); From 3a32cb9e5cab29db2c7d54b20d4ff79c43388680 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 17 Nov 2017 21:49:49 +0000 Subject: [PATCH 099/264] Fall back to def storage if the type isn't supported by MKLDNN. --- src/ndarray/ndarray.cc | 23 +++++++++++++++++++---- src/operator/nn/mkldnn/mkldnn_base-inl.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 2c75f6babef8..e9e1cb683040 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -48,10 +48,22 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { -NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, +static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, int dtype) { +#if MXNET_USE_MKLDNN == 1 + // We can't always generate a MKLDNN storage. If MKLDNN can't support the data type, + // we'll have to fall back to the default storage. 
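+  // (SupportMKLDNN currently accepts only float32, so other dtypes quietly
+  // degrade to the default storage here.)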
+ if (stype == kMKLDNNStorage && !SupportMKLDNN(dtype)) + return kDefaultStorage; + else +#endif + return stype; +} + +NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context ctx, bool delay_alloc, int dtype, std::vector aux_types, std::vector aux_shapes, TShape storage_shape) : shape_(shape), - dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) { + dtype_(dtype), storage_type_(DetermineSType(_stype, dtype)), entry_({nullptr, 0, 0}) { + NDArrayStorageType stype = DetermineSType(_stype, dtype); // Assign default aux types if not given if (aux_types.size() == 0 #if MXNET_USE_MKLDNN == 1 @@ -96,8 +108,11 @@ NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ct LOG(FATAL) << "Unknown storage type " << stype; } } - ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, - dtype, aux_types, aux_shapes); + if (stype == kDefaultStorage) + ptr_ = std::make_shared(shape, ctx, delay_alloc, dtype); + else + ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); } void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 33b9884e6252..87d0faebccf6 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -85,6 +85,10 @@ struct data_type_enum { enum { type = mkldnn::memory::data_type::u8 }; }; +static inline bool SupportMKLDNN(int dtype) { + return dtype == mshadow::kFloat32; +} + static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { switch(dtype) { case mshadow::kFloat32: From 7cf820137470e1663eac3ebbb4028733ed528bca Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 17 Nov 2017 21:50:42 +0000 Subject: [PATCH 100/264] Use NDArray for MKLDNN in python. --- python/mxnet/ndarray/sparse.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index 396163fccb66..d1f5e91b2c8c 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -49,7 +49,6 @@ pass from ._internal import _set_ndarray_class from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .mkldnn import MKLNDArray from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR, _STORAGE_TYPE_MKLDNN from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT from .ndarray import zeros as _zeros_ndarray @@ -1140,7 +1139,7 @@ def _ndarray_cls(handle, writable=True, stype=_STORAGE_TYPE_UNDEFINED): if stype == _STORAGE_TYPE_DEFAULT: return NDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_MKLDNN: - return MKLNDArray(handle, writable=False) + return NDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_CSR: return CSRNDArray(handle, writable=writable) elif stype == _STORAGE_TYPE_ROW_SPARSE: From d0f806f6d95b727f503def0cd1ddac309b96848d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 20 Nov 2017 21:16:47 +0000 Subject: [PATCH 101/264] Reshape output of MKLDNN convolution. 
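
When the fully connected operator flattens an N-D input to 2-D, the output
memory descriptor has to be built from the equally flattened output shape
rather than taken verbatim from the output NDArray. The shape arithmetic the
patch applies, as a small self-contained helper (a sketch mirroring the
diff, not the patch's own code; assumes a non-empty shape):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // flatten=true : (d0, d1, ..., dk) -> (d0, d1 * ... * dk)
    // flatten=false: (d0, ..., dk)     -> (d0 * ... * d(k-1), dk)
    std::pair<size_t, size_t> FlattenTo2D(const std::vector<size_t> &shape,
                                          bool flatten) {
      size_t prod = 1;
      if (flatten) {
        for (size_t i = 1; i < shape.size(); ++i) prod *= shape[i];
        return {shape[0], prod};
      }
      for (size_t i = 0; i + 1 < shape.size(); ++i) prod *= shape[i];
      return {prod, shape.back()};
    }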
--- .../nn/mkldnn/mkldnn_fully_connected.cc | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 9febc080729b..9ed1f0da08a8 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -32,10 +32,9 @@ namespace op { inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( const NDArray &data, const NDArray &weight, const NDArray *bias, - const NDArray &output) { + const mkldnn::memory::desc &out_md) { auto data_md = GetMemDesc(data); auto weight_md = GetMemDesc(weight); - auto out_md = GetMemDesc(output); auto engine = CpuEngine::Instance().get_engine(); if (bias) { auto bias_md = GetMemDesc(*bias); @@ -88,15 +87,29 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &out_data) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); const TShape& ishape = in_data[fullc::kData].shape(); + const TShape& oshape = out_data[fullc::kOut].shape(); NDArray weight = in_data[fullc::kWeight]; NDArray data = in_data[fullc::kData]; - if (data.shape().ndim() != 2 && !param.flatten) - data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); - else if (data.shape().ndim() != 2) + auto out_md = GetMemDesc(out_data[fullc::kOut]); + if (data.shape().ndim() != 2 && !param.flatten) { + data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), + ishape[ishape.ndim()-1])); + // TODO this can potentially be a problem when casting the type. + mkldnn::memory::dims out_dims{(int) oshape.ProdShape(0, oshape.ndim()-1), + (int) oshape[ishape.ndim()-1]}; + out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), + mkldnn::memory::format::any); + } + else if (data.shape().ndim() != 2) { data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + // TODO this can potentially be a problem when casting the type. + mkldnn::memory::dims out_dims{(int) oshape[0], (int) oshape.ProdShape(1, oshape.ndim())}; + out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), + mkldnn::memory::format::any); + } mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, - param.no_bias ? nullptr : &in_data[fullc::kBias], out_data[fullc::kOut]); + param.no_bias ? nullptr : &in_data[fullc::kBias], out_md); auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc()); auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], @@ -124,7 +137,8 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, NDArray weight = inputs[fullc::kWeight + 1]; NDArray data = inputs[fullc::kData + 1]; if (data.shape().ndim() != 2 && !param.flatten) - data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); + data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), + ishape[ishape.ndim()-1])); else if (data.shape().ndim() != 2) data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); NDArray out_grad = inputs[fullc::kOut]; @@ -134,7 +148,7 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, out_grad = out_grad.Reshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim()))); mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, - param.no_bias ? 
nullptr : &in_grad[fullc::kBias], out_grad);
+      param.no_bias ? nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad));

   CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
   if (req[fullc::kData]) {

From 39f5820bba0d332dc3b4412b7f14f6da3627206a Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Mon, 20 Nov 2017 21:32:58 +0000
Subject: [PATCH 102/264] Fix a bug in NDArray.

---
 src/ndarray/ndarray.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index e9e1cb683040..ff9e7835d618 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -441,6 +441,8 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
   // If the storage type is the default type, we can just simply
   // reference to the memory for the default storage.
   if (storage_type == kDefaultStorage) {
+    if (shandle.dptr == nullptr)
+      CheckAndAlloc();
     Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, cpu_engine),
                                       shandle.dptr));
   }

From 770050a161ad68e955ed26b37693782927d1bfe9 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Tue, 21 Nov 2017 19:57:20 +0000
Subject: [PATCH 103/264] Support more operations in MKLDNN NDArray.

---
 src/ndarray/ndarray.cc | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index ff9e7835d618..cad1a6c48c4e 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -341,8 +341,11 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
 }

 NDArray NDArray::At(index_t idx) const {
-  CHECK(storage_type() == kDefaultStorage) << "Storage type "
-    << storage_type() << " doesn't support At()";
+  CHECK(storage_type() == kDefaultStorage
+#if MXNET_USE_MKLDNN == 1
+        || storage_type() == kMKLDNNStorage
+#endif
+        ) << "Storage type " << storage_type() << " doesn't support At()";
   NDArray ret = this->Slice(idx, idx+1);
   if (shape_.ndim() > 1) {
     return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
@@ -352,8 +355,11 @@ NDArray NDArray::At(index_t idx) const {
 }

 NDArray NDArray::AtWithRecord(index_t idx) {
-  CHECK(storage_type() == kDefaultStorage)
-    << "Storage type " << storage_type() << " doesn't support At()";
+  CHECK(storage_type() == kDefaultStorage
+#if MXNET_USE_MKLDNN == 1
+        || storage_type() == kMKLDNNStorage
+#endif
+        ) << "Storage type " << storage_type() << " doesn't support At()";
   NDArray ret = this->SliceWithRecord(idx, idx+1);
   if (shape_.ndim() > 1) {
     return ret.ReshapeWithRecord(TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
@@ -416,7 +422,8 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
   mkldnn::memory::dims dims;
   // These are shapes supported by MKLDNN.
- if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4) { + if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4 + || shape.ndim() == 5) { dims.resize(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) dims[i] = shape[i]; @@ -429,12 +436,16 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { dims[i + 1] = shape[i]; } else - LOG(FATAL) << "Unsupported number of dimensions for MKLDNN"; + LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions"; mkldnn::memory::format layout = mkldnn::memory::format::format_undef; switch (dims.size()) { case 1: layout = mkldnn::memory::format::x; break; case 2: layout = mkldnn::memory::format::nc; break; case 4: layout = mkldnn::memory::format::nchw; break; + // TODO This isn't the right layout when the data has 5 dimensions in MXNet. + // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have + // a corresponding format. + case 5: layout = mkldnn::memory::format::goihw; break; } mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; auto cpu_engine = CpuEngine::Instance().get_engine(); From 23b5e194b09ca5619dc8edb217b619863a7adefd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 21 Nov 2017 19:58:23 +0000 Subject: [PATCH 104/264] Fix a bug in deconvolution. --- src/operator/nn/deconvolution.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 13642e643342..4015eb64f4d2 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -283,9 +283,8 @@ inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 3 : 4; uint32_t out_expected = param.no_bias ? 2 : 3; - CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 From 4bc02903eb2b09d7328deeb8a26de1d244d114b4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 21 Nov 2017 20:00:26 +0000 Subject: [PATCH 105/264] Fix bugs in MKLDNN deconvolution. We still need to compute bias correctly. --- .../nn/mkldnn/mkldnn_deconvolution.cc | 134 ++++++++---------- 1 file changed, 56 insertions(+), 78 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 7e5daf6ed251..7e849fd44d49 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -31,29 +31,38 @@ namespace mxnet { namespace op { +static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) { + mkldnn::memory::dims dims(1); + // This is convolution on 4D data. The second dimension is the channel. 
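+  // A deconvolution bias therefore carries one element per output channel.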
+ dims[0] = md.data.dims[1]; + return mkldnn::memory::desc(dims, + static_cast(md.data.data_type), + mkldnn::memory::format::any); +} + static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_( const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, - const mkldnn::memory::desc *bias_md, const mkldnn::memory::desc &out_md, + bool has_bias, const mkldnn::memory::desc &out_md, const mkldnn::engine &engine, const mkldnn::memory::dims &strides, - const mkldnn::memory::dims &padding) { - // TODO when dilate > 1 - if (bias_md == nullptr) { + const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) { + if (!has_bias) { mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides, - padding, padding, mkldnn::padding_kind::zero); + dilates, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } else { + auto bias_md = GetBiasDesc(data_md); mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, - *bias_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md, + data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } } static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwd( const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { + bool has_bias, const NDArray &output) { auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); @@ -68,28 +77,22 @@ static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwd( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - if (bias) { - auto bias_md = GetMemDesc(*bias); - auto bwd_pd = GetDeconvBwd_(data_md, weight_md, &bias_md, - out_md, engine, strides, padding); - // TODO when dilate > 1 - mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); - } - else { - auto bwd_pd = GetDeconvBwd_(data_md, weight_md, nullptr, out_md, engine, - strides, padding); - // TODO when dilate > 1 - mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); - } + mkldnn::memory::dims dilate{0, 0}; + if (param.dilate.ndim() == 2) { + dilate[0] = param.dilate[0] - 1; + dilate[1] = param.dilate[1] - 1; + } + auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, + strides, padding, dilate); + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, dilate, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); } static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { + bool has_bias, const NDArray &output) { auto 
data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); @@ -104,20 +107,18 @@ static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - // TODO dilate - if (bias) { - auto bias_md = GetMemDesc(*bias); - return GetDeconvBwd_(data_md, weight_md, &bias_md, out_md, - engine, strides, padding); + mkldnn::memory::dims dilate{0, 0}; + if (param.dilate.ndim() == 2) { + dilate[0] = param.dilate[0] - 1; + dilate[1] = param.dilate[1] - 1; } - else - return GetDeconvBwd_(data_md, weight_md, nullptr, out_md, - engine, strides, padding); + return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, + strides, padding, dilate); } static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output, + bool has_bias, const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); @@ -133,38 +134,23 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( padding[0] = param.pad[0]; padding[1] = param.pad[1]; } - if (/*param.dilate.ndim() == 0 &&*/ bias == nullptr) { + mkldnn::memory::dims dilate{0, 0}; + if (param.dilate.ndim() == 2) { + dilate[0] = param.dilate[0] - 1; + dilate[1] = param.dilate[1] - 1; + } + if (!has_bias) { mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, data_md, strides, padding, padding, mkldnn::padding_kind::zero); + out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } - else /*if (param.dilate.ndim() == 0)*/ { - auto bias_md = GetMemDesc(*bias); + else { + auto bias_md = GetBiasDesc(data_md); mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, bias_md, data_md, strides, padding, padding, + out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } -// else { -// // TODO I should test the case with dilate. 
-// mkldnn::memory::dims dilates{0, 0}; -// if (param.dilate.ndim() == 2) { -// dilates[0] = param.dilate[0]; -// dilates[1] = param.dilate[1]; -// } -// if (bias_mem == nullptr) { -// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, -// data_md, weights_md, out_md, strides, dilates, padding, padding, -// mkldnn::padding_kind::zero); -// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); -// } -// else { -// mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, -// data_md, weights_md, bias_mem->get_primitive_desc().desc(), out_md, -// strides, dilates, padding, padding, mkldnn::padding_kind::zero); -// return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); -// } -// } } void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, @@ -173,8 +159,8 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & const DeconvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd( - param, in_data[deconv::kData], in_data[deconv::kWeight], - param.no_bias ? nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); + param, in_data[deconv::kData], in_data[deconv::kWeight], false, + out_data[deconv::kOut]); auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( deconvFwd_pd.diff_dst_primitive_desc()); auto weight_mem = GetWeights(in_data[deconv::kWeight], @@ -205,7 +191,7 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext const DeconvolutionParam& param = nnvm::get(attrs.parsed); CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( - param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], nullptr, + param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false, inputs[deconv::kOut]); if (req[deconv::kData]) { auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( @@ -221,28 +207,20 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext if (req[deconv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd = GetDeconvBwdWeights(param, inputs[deconv::kData + 1], - inputs[deconv::kWeight + 1], - param.no_bias ? 
nullptr : &inputs[deconv::kWeight + 1], - inputs[deconv::kOut], bwdData_pd); + inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd); auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.diff_dst_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], bwdWeights_pd.diff_weights_primitive_desc(), req[deconv::kWeight]); - mkldnn_output_t in_grad_bias; - if (param.no_bias) { - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); - } else { - in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], - bwdWeights_pd.diff_bias_primitive_desc(), req[deconv::kBias]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second, - *in_grad_bias.second)); - } + MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); CommitOutput(in_grad[deconv::kWeight], in_grad_weight); - CommitOutput(in_grad[deconv::kBias], in_grad_bias); +// if (!param_.no_bias) { +// Tensor gbias = in_grad[deconv::kBias].get(s); +// Assign(gbias, req[deconv::kBias], sumall_except_dim<1>(grad)); +// } } MKLDNNStream::Instance().Submit(); }
From ea3b24c5a2b3346c1416bf39d2c4468895f8580c Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Tue, 21 Nov 2017 20:01:24 +0000
Subject: [PATCH 106/264] Have elemwise binary ops fall back to the default compute for MKLDNN.
--- .../tensor/elemwise_binary_scalar_op_basic.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index 9a278d8c97a0..8d2c4102684a 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -53,7 +53,11 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a std::vector<int>* in_attrs, std::vector<int>* out_attrs) { bool dispatched = false; - if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { + if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage, +#if MXNET_USE_MKLDNN == 1 + kMKLDNNStorage, nullptr +#endif + )) { dispatched = storage_type_assign(&out_attrs[0], kDefaultStorage, dispatch_mode, @@ -81,7 +85,11 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs, const auto in_stype = in_attrs->at(0); auto &out_stype = out_attrs->at(0); bool dispatched = false; - if (!dispatched && in_stype == kDefaultStorage) { + if (!dispatched && (in_stype == kDefaultStorage +#if MXNET_USE_MKLDNN == 1 + || in_stype == kMKLDNNStorage +#endif + )) { // dns -> dns dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
From d8f46728f3f317e4ff7c6832b41b069f2041b14c Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Tue, 21 Nov 2017 20:21:37 +0000
Subject: [PATCH 107/264] Limit the cases in which MKLDNN operations are called.
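MKLDNN primitives only handle a subset of inputs, so rather than switching on the data type at every call site, dispatch now goes through a pair of small predicates. The checks added to mkldnn_base-inl.h below boil down to:

    // Generic case: MKLDNN kernels are used only for fp32 1-D/2-D/4-D arrays.
    static inline bool SupportMKLDNN(int dtype, const TShape &shape) {
      int ndim = shape.ndim();
      return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4);
    }

    // Convolution and deconvolution additionally require 4-D (NCHW) inputs.
    static inline bool SupportMKLDNNConv(const NDArray &input) {
      return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
    }

DetermineSType in ndarray.cc now takes the shape as well, so an NDArray silently falls back to default storage when MKLDNN cannot represent the requested type and shape.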
--- src/ndarray/ndarray.cc | 8 ++++---- src/operator/nn/activation.cc | 21 ++++++++------------- src/operator/nn/convolution.cc | 15 +++++++-------- src/operator/nn/deconvolution.cc | 15 +++++++-------- src/operator/nn/fully_connected.cc | 15 +++++++-------- src/operator/nn/mkldnn/mkldnn_base-inl.h | 13 +++++++++++-- 6 files changed, 44 insertions(+), 43 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index cad1a6c48c4e..c0027277a90a 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -48,11 +48,11 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { -static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, int dtype) { +static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, int dtype, const TShape &shape) { #if MXNET_USE_MKLDNN == 1 // We can't always generate a MKLDNN storage. If MKLDNN can't support the data type, // we'll have to fall back to the default storage. - if (stype == kMKLDNNStorage && !SupportMKLDNN(dtype)) + if (stype == kMKLDNNStorage && !SupportMKLDNN(dtype, shape)) return kDefaultStorage; else #endif @@ -62,8 +62,8 @@ static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, int dt NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context ctx, bool delay_alloc, int dtype, std::vector aux_types, std::vector aux_shapes, TShape storage_shape) : shape_(shape), - dtype_(dtype), storage_type_(DetermineSType(_stype, dtype)), entry_({nullptr, 0, 0}) { - NDArrayStorageType stype = DetermineSType(_stype, dtype); + dtype_(dtype), storage_type_(DetermineSType(_stype, dtype, shape)), entry_({nullptr, 0, 0}) { + NDArrayStorageType stype = DetermineSType(_stype, dtype, shape); // Assign default aux types if not given if (aux_types.size() == 0 #if MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 6a6ebfddb3ff..b9411e3b244d 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -27,6 +27,7 @@ #include "../mshadow_op.h" #include "../tensor/elemwise_unary_op.h" #if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_base-inl.h" #include "./mkldnn/mkldnn_act-inl.h" #endif // MXNET_USE_MKLDNN @@ -58,12 +59,9 @@ static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNAct_Forward(ctx, param, inputs[0], req[0], outputs[0]); - return; - default: - break; + if (SupportMKLDNN(inputs[0])) { + MKLDNNAct_Forward(ctx, param, inputs[0], req[0], outputs[0]); + return; } #endif _ActivationCompute(param, ctx, inputs[0].data(), req[0], @@ -82,13 +80,10 @@ void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, #endif const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNAct_Backward(ctx, param, inputs[0], inputs[1], req[0], - outputs[0]); - return; - default: - break; + if (SupportMKLDNN(inputs[0])) { + MKLDNNAct_Backward(ctx, param, inputs[0], inputs[1], req[0], + outputs[0]); + return; } #endif _ActivationGradCompute(param, ctx, inputs[0].data(), inputs[1].data(), diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 3b3a2cdc963d..e748ad0ea32a 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -27,6 +27,7 @@ #include "./convolution-inl.h" #include "../elemwise_op_common.h" #include 
"./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_convolution-inl.h" #endif // MXNET_USE_NNPACK @@ -51,10 +52,9 @@ static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNConvolution_Forward(attrs, ctx, inputs, req, outputs); - return; + if (SupportMKLDNNConv(inputs[0])) { + MKLDNNConvolution_Forward(attrs, ctx, inputs, req, outputs); + return; } #endif // TODO I need to convert format. @@ -71,10 +71,9 @@ static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNConvolution_Backward(attrs, ctx, inputs, req, outputs); - return; + if (SupportMKLDNNConv(inputs[0])) { + MKLDNNConvolution_Backward(attrs, ctx, inputs, req, outputs); + return; } #endif // TODO I need to convert format. diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 4015eb64f4d2..19d5e915fb01 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -26,6 +26,7 @@ #include "./deconvolution-inl.h" #include "./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { @@ -305,10 +306,9 @@ static void DeconvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNDeconvolution_Forward(attrs, ctx, inputs, req, outputs); - return; + if (SupportMKLDNNConv(inputs[0])) { + MKLDNNDeconvolution_Forward(attrs, ctx, inputs, req, outputs); + return; } #endif // TODO I need to convert format. @@ -325,10 +325,9 @@ static void DeconvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNDeconvolution_Backward(attrs, ctx, inputs, req, outputs); - return; + if (SupportMKLDNNConv(inputs[0])) { + MKLDNNDeconvolution_Backward(attrs, ctx, inputs, req, outputs); + return; } #endif // TODO I need to convert format. diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 7f4e9796b024..b2281696fc93 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -24,6 +24,7 @@ */ #include "./fully_connected-inl.h" #include "./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_fully_connected-inl.h" #endif // MXNET_USE_NNPACK @@ -76,10 +77,9 @@ void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ct const std::vector &inputs, const std::vector &req, const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNFC_Forward(attrs, ctx, inputs, req, outputs); - return; + if (SupportMKLDNN(inputs[0])) { + MKLDNNFC_Forward(attrs, ctx, inputs, req, outputs); + return; } #endif // TODO I need to convert format. 
@@ -96,10 +96,9 @@ void FullyConnectedGradCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNFC_Backward(attrs, ctx, inputs, req, outputs); - return; + if (SupportMKLDNN(inputs[0])) { + MKLDNNFC_Backward(attrs, ctx, inputs, req, outputs); + return; } #endif // TODO I need to convert format. diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 87d0faebccf6..5d21fa72ae9b 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -85,8 +85,17 @@ struct data_type_enum { enum { type = mkldnn::memory::data_type::u8 }; }; -static inline bool SupportMKLDNN(int dtype) { - return dtype == mshadow::kFloat32; +static inline bool SupportMKLDNN(int dtype, const TShape &shape) { + int ndim = shape.ndim(); + return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); +} + +static inline bool SupportMKLDNN(const NDArray &input) { + return SupportMKLDNN(input.dtype(), input.shape()); +} + +static inline bool SupportMKLDNNConv(const NDArray &input) { + return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4; } static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { From 68add39b9b3a239e0d26f1ecbecc66a5abd6f160 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 22 Nov 2017 00:46:50 +0000 Subject: [PATCH 108/264] Force the layout of mkldnn::memory from NDArray. --- src/ndarray/ndarray.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index c0027277a90a..f27335fa554c 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -486,12 +486,9 @@ std::shared_ptr NDArray::GetMKLDNNData( auto desc2 = _desc.desc(); // The MKL memory has the same format and shape as required, // or both use the default format, we can return the MKL memory. - if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return ptr_->Mkl_mem_; - } - else if (desc1.data.format == GetDefaultFormat(desc1) - && desc2.data.format == GetDefaultFormat(desc2)) { + if (ptr_->Mkl_mem_->get_primitive_desc() == desc + || (desc1.data.format == GetDefaultFormat(desc1) + && desc2.data.format == GetDefaultFormat(desc2))) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); MKLDNNStream::Instance().RegisterMem(ret); From e09ee058d7749aadfa653cede48f9074d7a6c2d3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 22 Nov 2017 19:14:00 +0000 Subject: [PATCH 109/264] Add MKLDNN softmax. 
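The forward pass maps directly onto the MKLDNN softmax primitive. MKLDNNSoftmax_Forward (full version in mkldnn_softmax.cc below) builds a descriptor from the input's memory descriptor and the softmax axis, then queues the primitive on the thread-local stream:

    auto prop = ctx.is_train ? mkldnn::prop_kind::forward_training
                             : mkldnn::prop_kind::forward_scoring;
    mkldnn::softmax_forward::desc desc(prop, data_md, param.axis);
    mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine);
    stream.RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory));
    stream.Submit();

Only the forward primitive is wired up here; softmax.cc exposes it through an FComputeEx entry point and keeps the existing FCompute kernel as the fallback and as the backward path.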
--- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 4 ++ src/operator/nn/mkldnn/mkldnn_softmax.cc | 55 ++++++++++++++++++++++++ src/operator/nn/softmax.cc | 43 ++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 src/operator/nn/mkldnn/mkldnn_softmax.cc diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 710e439515f8..5866d7d5a917 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -63,6 +63,10 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext const std::vector& inputs, const std::vector& req, const std::vector& outputs); +/* For softmax */ +void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, const NDArray &out_data); + } } #endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc new file mode 100644 index 000000000000..1cf965915489 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_softmax.cc + * \brief + * \author Da Zheng +*/ + +#include "../softmax-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { + const SoftmaxParam& param = nnvm::get(attrs.parsed); + std::shared_ptr input_mem = in_data.GetMKLDNNData(); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + auto cpu_engine = data_mpd.get_engine(); + auto prop = ctx.is_train + ? 
mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop, + data_md, param.axis); + mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine); + + std::shared_ptr output_memory = out_data.GetMKLDNNData(); + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory)); + stream.Submit(); +} + +} +} +#endif diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 4686fb8c0dc1..bf2059a43b07 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -25,11 +25,52 @@ #include "./softmax-inl.h" #include "../tensor/elemwise_unary_op.h" #include "../tensor/elemwise_binary_op.h" +#include "mkldnn/mkldnn_base-inl.h" +#include "mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(SoftmaxParam); +static void SoftmaxCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + // It seems MKLDNN softmax doesn't support training. + if (SupportMKLDNN(inputs[0]) && !ctx.is_train) { + MKLDNNSoftmax_Forward(attrs, ctx, inputs[0], req[0], outputs[0]); + return; + } +#endif + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + SoftmaxCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kDefaultStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + MXNET_OPERATOR_REGISTER_UNARY(softmax) .describe(R"code(Applies the softmax function. @@ -54,6 +95,8 @@ Example:: )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", SoftmaxCompute) +.set_attr("FComputeEx", SoftmaxCompute_CPU) +.set_attr("FInferStorageType", SoftmaxStorageType) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_softmax"}) .add_arguments(SoftmaxParam::__FIELDS__()); From b6a6ea2736b2ea15ffad6fec2bced9402e56e47e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 22 Nov 2017 23:18:31 +0000 Subject: [PATCH 110/264] Fix output storage type of MKLDNN softmax. --- src/operator/nn/softmax.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index bf2059a43b07..7a20761a2d5d 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -62,7 +62,7 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kDefaultStorage; + (*out_attrs)[0] = kMKLDNNStorage; return true; } #endif From 045d980575bbe3123780c2977351b1dbcea0f696 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 22 Nov 2017 23:19:53 +0000 Subject: [PATCH 111/264] Add MKLDNN sum. 
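Summing n arrays maps onto the mkldnn::sum primitive: every input contributes with scale 1 and MKLDNN accumulates them into a single destination memory, so the reduction can run directly on the opaque MKLDNN layouts. Roughly (a sketch only; the full version is in mkldnn_sum.cc below, with the vector element types assumed to follow MKLDNN's v0.x sum API):

    std::vector<mkldnn::memory::primitive_desc> in_pds(inputs.size());
    std::vector<mkldnn::primitive::at> in_prims;
    std::vector<float> scales(inputs.size(), 1.0f);
    // collect one MKLDNN memory per input NDArray, then reduce them all
    mkldnn::sum::primitive_desc pdesc(scales, in_pds);
    MKLDNNStream::Instance().RegisterPrim(mkldnn::sum(pdesc, in_prims, *output_memory));

elemwise_sum.cc then routes to this implementation whenever an input uses kMKLDNNStorage.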
--- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 4 ++++ src/operator/nn/mkldnn/mkldnn_sum.cc | 23 +++++++++++++++++++- src/operator/tensor/elemwise_sum.cc | 28 +++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 5866d7d5a917..2d5513cc9854 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -67,6 +67,10 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &in_data, const OpReqType &req, const NDArray &out_data); +/* For sum */ +void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const OpReqType &req, const NDArray &out_data); + } } #endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc index 61ec1bbc4199..5645b276656f 100644 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -34,7 +34,7 @@ namespace op { void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, const mkldnn::memory &out) { std::vector input_pds(2); - std::vector scales(2); + std::vector scales(2); std::vector inputs; input_pds[0] = arr1.get_primitive_desc(); input_pds[1] = arr2.get_primitive_desc(); @@ -47,6 +47,27 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, MKLDNNStream::Instance().RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); } +void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const OpReqType &req, const NDArray &out_data) { + std::vector in_mems(inputs.size()); + std::vector in_prims; + std::vector in_pds(inputs.size()); + std::vector scales(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + in_mems[i] = inputs[i].GetMKLDNNData(); + in_prims.push_back(*in_mems[i]); + in_pds[i] = in_mems[i]->get_primitive_desc(); + scales[i] = 1; + } + mkldnn::sum::primitive_desc pdesc(scales, in_pds); + + std::shared_ptr output_memory + = const_cast(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc()); + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterPrim(mkldnn::sum(pdesc, in_prims, *output_memory)); + stream.Submit(); +} + } } #endif diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 041a0be00796..28428698212e 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -24,6 +24,7 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { @@ -72,6 +73,22 @@ bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } +static inline bool ContainMKLStorage(const std::vector &storages) { + for (const auto& i : storages) { + if (i == kMKLDNNStorage) + return true; + } + return false; +} + +static inline bool ContainMKLStorage(const std::vector& inputs) { + for (const auto &i : inputs) { + if (i.storage_type() == kMKLDNNStorage) + return true; + } + return false; +} + bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -79,6 +96,13 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 1U); +#if MXNET_USE_MKLDNN 
== 1 + if (dev_mask == mshadow::cpu::kDevMask && ContainMKLStorage(*in_attrs)) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif return ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode, in_attrs, out_attrs); } @@ -99,6 +123,10 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, ResourceRequest(ResourceRequest::kTempSpace)); NDArray out_nd = outputs[0]; mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); +#if MXNET_USE_MKLDNN == 1 + } else if (ContainMKLStorage(inputs)) { + MKLDNNSum_Forward(attrs, op_ctx, inputs, req[0], outputs[0]); +#endif } else { LOG(FATAL) << "Not implemented: " << operator_string(attrs, op_ctx, inputs, req, outputs); } From 852a106d703a2f3cc977ba6453225123d38ac8d1 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 30 Nov 2017 00:34:17 +0000 Subject: [PATCH 112/264] Fix a bug in elemwise sum. --- src/operator/tensor/elemwise_sum.cc | 36 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 28428698212e..73a8ae2f246b 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -73,22 +73,33 @@ bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } -static inline bool ContainMKLStorage(const std::vector &storages) { +static inline bool ContainStorage(const std::vector &storages, + NDArrayStorageType type) { for (const auto& i : storages) { - if (i == kMKLDNNStorage) + if (i == type) return true; } return false; } -static inline bool ContainMKLStorage(const std::vector& inputs) { +static inline bool ContainStorage(const std::vector& inputs, + NDArrayStorageType type) { for (const auto &i : inputs) { - if (i.storage_type() == kMKLDNNStorage) + if (i.storage_type() == type) return true; } return false; } +static inline bool ContainOnlyStorage(const std::vector& inputs, + NDArrayStorageType type) { + for (const auto &i : inputs) { + if (i.storage_type() != type) + return false; + } + return true; +} + bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -97,7 +108,8 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 1U); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && ContainMKLStorage(*in_attrs)) { + if (dev_mask == mshadow::cpu::kDevMask + && ContainStorage(*in_attrs, kMKLDNNStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -124,9 +136,21 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, NDArray out_nd = outputs[0]; mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); #if MXNET_USE_MKLDNN == 1 - } else if (ContainMKLStorage(inputs)) { + } else if (ContainStorage(inputs, kMKLDNNStorage)) { MKLDNNSum_Forward(attrs, op_ctx, inputs, req[0], outputs[0]); #endif + } else if (ContainOnlyStorage(inputs, kDefaultStorage)) { + // This case happens when we want to create an MKLDNN NDArray but the type + // or the shape isn't supported by MKLDNN. In this case, NDArray falls back + // to the default storage type and, thus, we have to handle the default + // storage in FComputeEx. 
+ std::vector in_blobs(inputs.size()); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ElementWiseSumCompute(attrs, op_ctx, in_blobs, req, out_blobs); } else { LOG(FATAL) << "Not implemented: " << operator_string(attrs, op_ctx, inputs, req, outputs); } From 9ebb034c9daa9374080c73ebc6d1715ef1f7936b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 30 Nov 2017 02:06:55 +0000 Subject: [PATCH 113/264] Fix a bug in MKLDNN softmax. --- src/operator/nn/softmax.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 7a20761a2d5d..50c954656000 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -36,8 +36,10 @@ static void SoftmaxCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 + const SoftmaxParam& param = nnvm::get(attrs.parsed); // It seems MKLDNN softmax doesn't support training. - if (SupportMKLDNN(inputs[0]) && !ctx.is_train) { + // and it only supports non-negative axis. + if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) { MKLDNNSoftmax_Forward(attrs, ctx, inputs[0], req[0], outputs[0]); return; } From b38c466e3a9ee150f5bf023599d084be01dc8e39 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 30 Nov 2017 19:15:50 +0000 Subject: [PATCH 114/264] Fix a bug in imperative. Clean up dispatch modes. --- src/imperative/imperative_utils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index e265cce28e38..528cd06c4bee 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -600,6 +600,7 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev } if (match) return true; } + g.attrs.erase("dispatch_mode"); g.attrs.erase("storage_type"); g.attrs.erase("storage_type_inputs"); if (node_range.second > node_range.first) { From d1544d88a4987704263027ebde3ed4133ba2ece3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 30 Nov 2017 21:53:20 +0000 Subject: [PATCH 115/264] Remove redundant code. 
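InferStorageType initialized the "dispatch_mode" graph attribute twice with back-to-back identical blocks; drop the duplicate and keep a single initialization.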
--- src/executor/infer_graph_attr_pass.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 67e61aa357c2..3bf3c9b8545a 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -416,11 +416,6 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph, DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined); graph.attrs["dispatch_mode"] = std::make_shared(std::move(dispatch_modes)); } - // initialize unknown values for dispatch modes - if (graph.attrs.count("dispatch_mode") == 0) { - DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined); - graph.attrs["dispatch_mode"] = std::make_shared(std::move(dispatch_modes)); - } // initialize the dev_mask vector from the context vector if (graph.attrs.count("dev_mask") == 0) { CHECK_GT(graph.attrs.count("context"), 0); From 2886818a50d6b4e37b1b728bab35d15392d55373 Mon Sep 17 00:00:00 2001 From: Ashok Emani Date: Tue, 21 Nov 2017 01:08:30 -0800 Subject: [PATCH 116/264] MKLDNN Pooling Op integration --- src/ndarray/ndarray.cc | 2 +- src/operator/nn/mkldnn/mkldnn_base-inl.h | 206 +++++++++-------- src/operator/nn/pooling.cc | 269 ++++++++++++++++------- 3 files changed, 309 insertions(+), 168 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f27335fa554c..d81866e4f2f1 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -403,7 +403,7 @@ void NDArray::set_fresh_out_grad(bool state) const { #if MXNET_USE_MKLDNN == 1 static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) { - if (shape.ndim() != ndims) + if (shape.ndim() != (size_t)ndims) return false; for (int i = 0; i < ndims; i++) if (shape[i] != dims[i]) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 5d21fa72ae9b..c96fd5825224 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -27,62 +27,65 @@ #define MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ #if MXNET_USE_MKLDNN == 1 +#include #include +#include #include -#include #include "mkldnn.hpp" - +using namespace mkldnn; namespace mxnet { extern bool EnableMkldnnWarnGenerated(); // ===== CpuEngine ======================================= // cpu_engine singleton class CpuEngine { public: - static CpuEngine & Instance() { - // I's thread-safe in C++11. - static thread_local CpuEngine myInstance; - return myInstance; - } - CpuEngine(CpuEngine const&) = delete; // Copy construct - CpuEngine(CpuEngine&&) = delete; // Move construct - CpuEngine& operator=(CpuEngine const&) = delete; // Copy assign - CpuEngine& operator=(CpuEngine &&) = delete; // Move assign - - mkldnn::engine & get_engine() { return _cpu_engine; } + static CpuEngine &Instance() { + // I's thread-safe in C++11. 
+ static thread_local CpuEngine myInstance; + return myInstance; + } + CpuEngine(CpuEngine const &) = delete; // Copy construct + CpuEngine(CpuEngine &&) = delete; // Move construct + CpuEngine &operator=(CpuEngine const &) = delete; // Copy assign + CpuEngine &operator=(CpuEngine &&) = delete; // Move assign + + mkldnn::engine &get_engine() { return _cpu_engine; } + protected: - CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {} - ~CpuEngine() {} + CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {} + ~CpuEngine() {} + private: - mkldnn::engine _cpu_engine; + mkldnn::engine _cpu_engine; }; // type enumerator -template +template struct data_type_enum {}; -template<> +template <> struct data_type_enum { - enum { type = mkldnn::memory::data_type::f32 }; + enum { type = mkldnn::memory::data_type::f32 }; }; -template<> +template <> struct data_type_enum { - enum { type = mkldnn::memory::data_type::s32 }; + enum { type = mkldnn::memory::data_type::s32 }; }; -template<> +template <> struct data_type_enum { - enum { type = mkldnn::memory::data_type::s16 }; + enum { type = mkldnn::memory::data_type::s16 }; }; -template<> +template <> struct data_type_enum { - enum { type = mkldnn::memory::data_type::s8 }; + enum { type = mkldnn::memory::data_type::s8 }; }; -template<> +template <> struct data_type_enum { - enum { type = mkldnn::memory::data_type::u8 }; + enum { type = mkldnn::memory::data_type::u8 }; }; static inline bool SupportMKLDNN(int dtype, const TShape &shape) { @@ -99,7 +102,7 @@ static inline bool SupportMKLDNNConv(const NDArray &input) { } static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { - switch(dtype) { + switch (dtype) { case mshadow::kFloat32: return mkldnn::memory::data_type::f32; default: @@ -109,10 +112,9 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) { mkldnn::memory::dims dims(ndim); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = arr.shape()[i]; + for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()), - mkldnn::memory::format::any}; + mkldnn::memory::format::any}; } inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { @@ -120,17 +122,16 @@ inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { } inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, - int num_groups) { + int num_groups) { if (num_groups == 1) { return GetMemDesc(arr); - } - else { + } else { CHECK_EQ(arr.shape().ndim(), 4U); - mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, - (int) arr.shape()[0] / num_groups, (int) arr.shape()[1], - (int) arr.shape()[2], (int) arr.shape()[3]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ + num_groups, (int)arr.shape()[0] / num_groups, (int)arr.shape()[1], + (int)arr.shape()[2], (int)arr.shape()[3]}; return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()), - mkldnn::memory::format::any}; + mkldnn::memory::format::any}; } } @@ -141,19 +142,16 @@ class MKLDNNStream { std::vector net; // Here we hold all memory related to the operators in the stream. 
std::vector mem_holder; -public: + + public: static MKLDNNStream &Instance() { static thread_local MKLDNNStream stream; return stream; } - void RegisterPrim(const mkldnn::primitive &prim) { - net.push_back(prim); - } + void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); } - void RegisterMem(mkldnn_mem_const_ptr mem) { - mem_holder.push_back(mem); - } + void RegisterMem(mkldnn_mem_const_ptr mem) { mem_holder.push_back(mem); } void Submit() { mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); @@ -162,7 +160,14 @@ class MKLDNNStream { } }; -inline static mkldnn_mem_ptr CreateMKLDNNMem(const mkldnn::memory::primitive_desc &desc) { +// some operators need to share workspace between fwd/bwd +inline std::unordered_map &mkldnn_wmap() { + static std::unordered_map _wmap; + return _wmap; +} + +inline static mkldnn_mem_ptr CreateMKLDNNMem( + const mkldnn::memory::primitive_desc &desc) { // TODO allocate memory more efficiently. std::shared_ptr ret(new mkldnn::memory(desc)); MKLDNNStream::Instance().RegisterMem(ret); @@ -177,8 +182,9 @@ enum OutDataOp { typedef std::pair mkldnn_output_t; -static inline mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, - const mkldnn::memory::primitive_desc &desc, OpReqType req) { +static inline mkldnn_output_t CreateMKLDNNMem( + const NDArray &arr, const mkldnn::memory::primitive_desc &desc, + OpReqType req) { if (kAddTo == req) return mkldnn_output_t(OutDataOp::AddBack, CreateMKLDNNMem(desc)); else { @@ -192,56 +198,64 @@ static inline mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, namespace op { void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, - const mkldnn::memory &out); + const mkldnn::memory &out); } -static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { +static inline void CommitOutput(const NDArray &arr, + const mkldnn_output_t &res) { if (res.first == CopyBack) const_cast(arr).CopyFrom(*res.second); else if (res.first == AddBack) { // TODO I might need to reorder. - mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); + mkldnn_mem_const_ptr mem = + arr.GetMKLDNNData(res.second->get_primitive_desc()); CHECK(mem != nullptr); // We have to allocate new memory for the sum result. 
- mkldnn_mem_ptr sum_res(new mkldnn::memory(res.second->get_primitive_desc())); + mkldnn_mem_ptr sum_res( + new mkldnn::memory(res.second->get_primitive_desc())); MKLDNNStream::Instance().RegisterMem(sum_res); op::Sum(*res.second, *mem, *sum_res); const_cast(arr).CopyFrom(*sum_res); } } -inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, - const mkldnn::memory::primitive_desc &target_pd, int num_groups) { +inline static mkldnn_mem_const_ptr GetWeights( + const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, + int num_groups) { mkldnn_mem_const_ptr mem; mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Instance().get_engine(); if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], - (int) arr.shape()[1]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; - mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mkldnn::memory::dims tz = + mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1]}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); - } - else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], - (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; - mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + } else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = + mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1], + (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); - } - else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, - (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; - mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + } else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + num_groups, (int)arr.shape()[0] / num_groups, (int)arr.shape()[1], + (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); - } - else { + } else { LOG(FATAL) << "The weight array has an unsupported number of dimensions"; return nullptr; } - if (mem->get_primitive_desc() == target_pd) - return mem; + if (mem->get_primitive_desc() == target_pd) return mem; std::shared_ptr ret = CreateMKLDNNMem(target_pd); MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); @@ -249,30 +263,36 @@ inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, } inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, - const mkldnn::engine &engine, int num_groups = 1) { + const mkldnn::engine &engine, + int num_groups = 1) { mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); if 
(arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], - (int) arr.shape()[1]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; - mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + mkldnn::memory::dims tz = + mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1]}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); - } - else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = mkldnn::memory::dims{(int) arr.shape()[0], - (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; - mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + } else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = + mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1], + (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); - } - else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{num_groups, (int) arr.shape()[0] / num_groups, - (int) arr.shape()[1], (int) arr.shape()[2], (int) arr.shape()[3]}; - mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; - mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; + } else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + num_groups, (int)arr.shape()[0] / num_groups, (int)arr.shape()[1], + (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); - } - else { + } else { LOG(FATAL) << "The weight array has an unsupported number of dimensions"; return nullptr; } diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 41ace3cecae0..ae40be3ea8b3 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -23,8 +23,8 @@ * \brief * \author Bing Xu, Jun Wu, Da Zheng */ -#include "./pooling-inl.h" #include "../elemwise_op_common.h" +#include "./pooling-inl.h" #if MXNET_USE_MKL2017 == 1 #include #include "../mkl/mkl_memory-inl.h" @@ -33,11 +33,14 @@ #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_pooling-inl.h" #endif // MXNET_USE_NNPACK +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_pooling-inl.h" +#endif // MXNET_USE_MKLDNN namespace mxnet { namespace op { -static void PoolingParamParser(nnvm::NodeAttrs* attrs) { +static void PoolingParamParser(nnvm::NodeAttrs *attrs) { using namespace mshadow; PoolingParam param_; param_.Init(attrs->dict); @@ -48,102 +51,125 @@ static void PoolingParamParser(nnvm::NodeAttrs* attrs) { if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() + << "D pooling not supported"; if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 
1, 1); if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); } CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) - << "stride and kernel should have the same length"; + << "stride and kernel should have the same length"; CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) - << "pad and kernel should have the same length"; + << "pad and kernel should have the same length"; attrs->parsed = std::move(param_); } -static bool PoolingShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { - const PoolingParam& param_ = nnvm::get(attrs.parsed); +static bool PoolingShape(const nnvm::NodeAttrs &attrs, + std::vector *in_shape, + std::vector *out_shape) { + const PoolingParam ¶m_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), 1U); const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" - << " Or 4D in (batch, channel, y, x) " - << " Or 5D in (batch, channel, d, y, x)"; + CHECK_GE(dshape.ndim(), 3U) + << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; TShape oshape = dshape; - if (dshape.ndim() == 0) return false; + if (dshape.ndim() == 0) return false; if (param_.kernel.ndim() == 1) { - CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + CHECK_EQ(dshape.ndim(), 3U) + << "Pooling: Input data should be 3D in (batch, channel, x)"; if (param_.global_pool) { oshape[2] = 1; } else { CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + << "kernel size (" << param_.kernel[0] << ") exceeds input (" + << dshape[2] << " padded to " << (dshape[2] + 2 * param_.pad[0]) + << ")"; if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; + oshape[2] = 1 + + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); + oshape[2] = 1 + static_cast(ceil( + static_cast(dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / + param_.stride[0])); } } out_shape->clear(); out_shape->push_back(oshape); // save output shape } else if (param_.kernel.ndim() == 2) { - CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + CHECK_EQ(dshape.ndim(), 4U) + << "Pooling: Input data should be 4D in (batch, channel, y, x)"; if (param_.global_pool) { oshape[2] = 1; oshape[3] = 1; } else { CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + << "kernel size (" << param_.kernel[0] << ") exceeds input (" + << dshape[2] << " padded to " << (dshape[2] + 2 * param_.pad[0]) + << ")"; CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) - << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] - << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + << "kernel size (" << param_.kernel[1] << ") exceeds input (" + << dshape[3] << " padded to " << (dshape[3] + 2 * param_.pad[1]) + << ")"; if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - 
param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; + oshape[2] = 1 + + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); + oshape[2] = 1 + static_cast(ceil( + static_cast(dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / + param_.stride[0])); + oshape[3] = 1 + static_cast(ceil( + static_cast(dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / + param_.stride[1])); } } out_shape->clear(); out_shape->push_back(oshape); // save output shape } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) - << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; - CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; - CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; - CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; + << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) + << "kernel size exceeds input"; + CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) + << "kernel size exceeds input"; + CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) + << "kernel size exceeds input"; if (param_.global_pool) { oshape[2] = 1; oshape[3] = 1; oshape[4] = 1; } else { if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; - oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / - param_.stride[2]; + oshape[2] = 1 + + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + oshape[4] = 1 + + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / + param_.stride[2]; } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); - oshape[4] = 1 + static_cast(ceil(static_cast( - dshape[4] + 2 * param_.pad[2] - - param_.kernel[2]) / param_.stride[2])); + oshape[2] = 1 + static_cast(ceil( + static_cast(dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / + param_.stride[0])); + oshape[3] = 1 + static_cast(ceil( + static_cast(dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / + param_.stride[1])); + oshape[4] = 1 + static_cast(ceil( + static_cast(dshape[4] + 2 * param_.pad[2] - + param_.kernel[2]) / + param_.stride[2])); } } @@ -153,10 +179,54 @@ static bool PoolingShape(const nnvm::NodeAttrs& attrs, return true; } +void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + const PoolingParam ¶m = nnvm::get(attrs.parsed); + MKLDNNPooling_Forward(ctx, param, inputs[0], req[0], outputs[0]); + 
return; + } +#endif + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + PoolingCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { +#if MXNET_USE_MKLDNN == 1 + switch (inputs[0].dtype()) { + case mshadow::kFloat32: + const PoolingParam ¶m = nnvm::get(attrs.parsed); + MKLDNNPooling_Backward(ctx, param, inputs[0], inputs[1], req[0], + outputs[0]); + return; + } +#endif + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + PoolingGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + struct PoolingGrad { const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { + std::vector operator()( + const nnvm::NodePtr &n, + const std::vector &ograds) const { std::vector heads; heads.push_back(ograds[pool_enum::kOut]); heads.push_back(n->inputs[pool_enum::kData]); @@ -165,10 +235,52 @@ struct PoolingGrad { } }; +inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + DMLC_REGISTER_PARAMETER(PoolingParam); NNVM_REGISTER_OP(Pooling) -.describe(R"code(Performs pooling on the input. + .describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are @@ -207,28 +319,37 @@ For 3-D pooling, an additional *depth* dimension is added before height, width)*. 
)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(1) -.set_attr_parser(PoolingParamParser) -.set_attr("FInferType", ElemwiseType<1, 1>) -.set_attr("FInferShape", PoolingShape) -.set_attr("FCompute", PoolingCompute) -.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_Pooling"}) -.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.") -.add_arguments(PoolingParam::__FIELDS__()); + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr_parser(PoolingParamParser) + .set_attr("FInferStorageType", PoolingStorageType) + .set_attr("FInferType", ElemwiseType<1, 1>) + .set_attr("FInferShape", PoolingShape) + .set_attr("FCompute", PoolingCompute) + .set_attr("FComputeEx", PoolingCompute_CPU) + .set_attr("FGradient", + ElemwiseGradUseInOut{"_backward_Pooling"}) + .add_argument("data", "NDArray-or-Symbol", + "Input data to the pooling operator.") + .add_arguments(PoolingParam::__FIELDS__()); NNVM_REGISTER_OP(_backward_Pooling) -.set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + .set_num_outputs(1) + .set_attr("TIsBackward", true) + .set_attr( + "FInplaceOption", + [](const NodeAttrs &attrs) { #if MXNET_USE_CUDNN == 1 - return std::vector >(); + return std::vector >(); #else - return std::vector >{{1, 0}}; + return std::vector >{{1, 0}}; #endif -}) -.set_attr_parser(PoolingParamParser) -.set_attr("FCompute", PoolingGradCompute); + }) + .set_attr("FInferStorageType", + backward_PoolingStorageType) + .set_attr_parser(PoolingParamParser) + .set_attr("FCompute", PoolingGradCompute) + .set_attr("FComputeEx", PoolingGradCompute_CPU); } // namespace op } // namespace mxnet From 12136706bdd4e94b5c57083f3c906db06aaba594 Mon Sep 17 00:00:00 2001 From: Ashok Emani Date: Tue, 21 Nov 2017 01:11:07 -0800 Subject: [PATCH 117/264] MKLDNN Pooling Op integration add missing file --- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 177 ++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 src/operator/nn/mkldnn/mkldnn_pooling-inl.h diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h new file mode 100644 index 000000000000..9312599f8fad --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_pooling-inl.h
+ * \brief MKLDNN pooling operator
+*/
+
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#include "../pooling-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static inline algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
+  switch (param.pool_type) {
+    case pool_enum::kMaxPooling:
+      return algorithm::pooling_max;
+    case pool_enum::kAvgPooling:
+      return algorithm::pooling_avg;
+    default:
+      LOG(FATAL) << "Unknown pooling method.";
+      return algorithm::pooling_max;
+  }
+}
+
+inline static pooling_forward::primitive_desc GetPoolingFwd(
+    const PoolingParam &param, bool is_train, const memory::desc &data_md,
+    const memory::desc &out_md) {
+  CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
+  auto pad_t_ = param.pad[0], pad_b_ = param.pad[0];
+  auto pad_l_ = param.pad[1], pad_r_ = param.pad[1];
+  auto stride_h_ = param.stride[0], stride_w_ = param.stride[1];
+
+  auto engine = CpuEngine::Instance().get_engine();
+  if (param.global_pool) {
+    CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With global_pool set, only pad = 0 and stride = 1 are supported";
+  }
+  if (pad_t_ != 0 || pad_l_ != 0) {
+    CHECK(param.pool_type == pool_enum::kAvgPooling ||
+          param.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+    CHECK_LT(pad_l_, kernel_w_);
+    CHECK_LT(pad_t_, kernel_h_);
+  }
+  auto alg = GetMKLDNNPoolAlgo(param);
+  auto kind = prop_kind::forward_scoring;
+  if (is_train && alg != algorithm::pooling_avg) {
+    kind = prop_kind::forward_training;
+  }
+  pooling_forward::desc poolingFwd_desc(
+      kind, alg, data_md, out_md, {(int)stride_h_, (int)stride_w_},
+      {kernel_h_, kernel_w_}, {(int)pad_t_, (int)pad_l_}, {(int)pad_b_, (int)pad_r_},
+      padding_kind::zero);
+  return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine);
+}
+
+template
+void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam &param,
+                           const NDArray &in_data, const OpReqType &req,
+                           const NDArray &out_data) {
+  std::shared_ptr input_mem = in_data.GetMKLDNNData();
+  auto data_mpd = input_mem->get_primitive_desc();
+  auto data_md = data_mpd.desc();
+
+  memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1],
+                       (int)out_data.shape()[2], (int)out_data.shape()[3]};
+  memory::desc out_md({dims},
+                      static_cast(data_md.data.data_type),
+                      static_cast(data_md.data.format));
+
+  auto pdesc = GetPoolingFwd(param, ctx.is_train, data_md, out_md);
+
+  std::shared_ptr output_memory =
+      const_cast(out_data).CreateMKLDNNData(
+          pdesc.dst_primitive_desc());
+
+  if (ctx.is_train && param.pool_type != pool_enum::kAvgPooling) {
+    // TODO: reuse workspace_mem from 2nd iter
+    auto workspace_mem = CreateMKLDNNMem(pdesc.workspace_primitive_desc());
+    mkldnn_wmap()[&in_data] = workspace_mem;
+    MKLDNNStream::Instance().RegisterPrim(
+        pooling_forward(pdesc, *input_mem, *output_memory, *workspace_mem));
+  } else {
+    MKLDNNStream::Instance().RegisterPrim(
+        pooling_forward(pdesc, *input_mem, *output_memory));
+  }
+  MKLDNNStream::Instance().Submit();
+}
+
+template
+void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam &param,
+                            const NDArray &out_grad, const NDArray &in_data,
+                            const OpReqType 
&req, const NDArray &in_grad) { + if (req == kNullOp) { + return; + } + + std::shared_ptr diff_dst_mem = out_grad.GetMKLDNNData(); + std::shared_ptr input_mem = in_data.GetMKLDNNData(); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1], + (int)out_grad.shape()[2], (int)out_grad.shape()[3]}; + memory::desc out_md({dims}, + static_cast(data_md.data.data_type), + static_cast(data_md.data.format)); + auto pdesc_fwd = GetPoolingFwd(param, ctx.is_train, data_md, out_md); + + mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc(); + memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1], + (int)in_grad.shape()[2], (int)in_grad.shape()[3]}; + memory::desc diff_in_md( + {dims1}, static_cast(diff_md.data.data_type), + static_cast(diff_md.data.format)); + auto cpu_engine = data_mpd.get_engine(); + + auto alg = GetMKLDNNPoolAlgo(param); + + pooling_backward::desc desc( + alg, diff_in_md, diff_md, {(int)param.stride[0], (int)param.stride[1]}, + {(int)param.kernel[0], (int)param.kernel[1]}, {(int)param.pad[0], (int)param.pad[1]}, + {(int)param.pad[0], (int)param.pad[1]}, padding_kind::zero); + pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd); + + auto diff_src_mem = + CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req); + + if (param.pool_type != pool_enum::kAvgPooling) { + // look-up workspace mem used for fwd + auto workspace_mem = mkldnn_wmap()[&in_data]; + // TODO: remove workspace_mem after submit + MKLDNNStream::Instance().RegisterPrim( + pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), + *diff_src_mem.second)); + } else { + MKLDNNStream::Instance().RegisterPrim( + pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second)); + } + CommitOutput(in_grad, diff_src_mem); + MKLDNNStream::Instance().Submit(); +} +} +} +#endif // MXNET_USE_MKLDNN == 1 From ecfb6b7600528829b235b3bd0f2a1b555dcdbd53 Mon Sep 17 00:00:00 2001 From: Ashok Emani Date: Thu, 23 Nov 2017 01:40:50 -0800 Subject: [PATCH 118/264] fix mkldnn pooling op workspace issue --- include/mxnet/ndarray.h | 3 +++ src/operator/nn/mkldnn/mkldnn_base-inl.h | 4 ++-- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 8d51858f774f..4f4b998dc999 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -572,6 +572,9 @@ class NDArray { void CopyFrom(const mkldnn::memory &mem); std::shared_ptr CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); + inline const void *getPtr() const{ + return static_cast(ptr_.get()); + } #endif /*! 
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index c96fd5825224..33df6f8b4c53 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -161,8 +161,8 @@ class MKLDNNStream { }; // some operators need to share workspace between fwd/bwd -inline std::unordered_map &mkldnn_wmap() { - static std::unordered_map _wmap; +inline std::unordered_map &mkldnn_wmap() { + static std::unordered_map _wmap; return _wmap; } diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index 9312599f8fad..e813ea77a3dd 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -110,7 +110,7 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, if (ctx.is_train && param.pool_type != pool_enum::kAvgPooling) { // TODO: reuse workspace_mem from 2nd iter auto workspace_mem = CreateMKLDNNMem(pdesc.workspace_primitive_desc()); - mkldnn_wmap()[&in_data] = workspace_mem; + mkldnn_wmap()[in_data.getPtr()] = workspace_mem; MKLDNNStream::Instance().RegisterPrim( pooling_forward(pdesc, *input_mem, *output_memory, *workspace_mem)); } else { @@ -160,7 +160,7 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, if (param.pool_type != pool_enum::kAvgPooling) { // look-up workspace mem used for fwd - auto workspace_mem = mkldnn_wmap()[&in_data]; + auto workspace_mem = mkldnn_wmap()[in_data.getPtr()]; // TODO: remove workspace_mem after submit MKLDNNStream::Instance().RegisterPrim( pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), From adbec7a91badabcb7c95206c51ed54c8a29179d7 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 1 Dec 2017 01:38:38 +0000 Subject: [PATCH 119/264] handle workspace in MKLDNN pooling correctly. --- include/mxnet/ndarray.h | 3 - src/operator/nn/mkldnn/mkldnn_base-inl.h | 15 ++-- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 27 +++---- src/operator/nn/pooling.cc | 80 ++++++++++++++++++--- 4 files changed, 93 insertions(+), 32 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 4f4b998dc999..8d51858f774f 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -572,9 +572,6 @@ class NDArray { void CopyFrom(const mkldnn::memory &mem); std::shared_ptr CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); - inline const void *getPtr() const{ - return static_cast(ptr_.get()); - } #endif /*! 
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 33df6f8b4c53..2ce19a451501 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -90,7 +90,7 @@ struct data_type_enum { static inline bool SupportMKLDNN(int dtype, const TShape &shape) { int ndim = shape.ndim(); - return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); + return ndim == 1 || ndim == 2 || ndim == 4; } static inline bool SupportMKLDNN(const NDArray &input) { @@ -105,7 +105,14 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { switch (dtype) { case mshadow::kFloat32: return mkldnn::memory::data_type::f32; + case mshadow::kInt32: + return mkldnn::memory::data_type::s32; + case mshadow::kInt8: + return mkldnn::memory::data_type::s8; + case mshadow::kUint8: + return mkldnn::memory::data_type::u8; default: + LOG(FATAL) << "unknown type for MKLDNN"; return mkldnn::memory::data_type::data_undef; } } @@ -160,12 +167,6 @@ class MKLDNNStream { } }; -// some operators need to share workspace between fwd/bwd -inline std::unordered_map &mkldnn_wmap() { - static std::unordered_map _wmap; - return _wmap; -} - inline static mkldnn_mem_ptr CreateMKLDNNMem( const mkldnn::memory::primitive_desc &desc) { // TODO allocate memory more efficiently. diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index e813ea77a3dd..f3f4c68f13e6 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -87,10 +87,13 @@ inline static pooling_forward::primitive_desc GetPoolingFwd( return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine); } -template +inline bool MKLDNNRequireWorkspace(const PoolingParam ¶m) { + return param.pool_type != pool_enum::kAvgPooling; +} + void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { + const NDArray &out_data, const NDArray *workspace) { std::shared_ptr input_mem = in_data.GetMKLDNNData(); auto data_mpd = input_mem->get_primitive_desc(); auto data_md = data_mpd.desc(); @@ -106,11 +109,11 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, std::shared_ptr output_memory = const_cast(out_data).CreateMKLDNNData( pdesc.dst_primitive_desc()); + std::shared_ptr workspace_mem; - if (ctx.is_train && param.pool_type != pool_enum::kAvgPooling) { - // TODO: reuse workspace_mem from 2nd iter - auto workspace_mem = CreateMKLDNNMem(pdesc.workspace_primitive_desc()); - mkldnn_wmap()[in_data.getPtr()] = workspace_mem; + if (ctx.is_train && MKLDNNRequireWorkspace(param)) { + CHECK(workspace != nullptr); + workspace_mem = workspace->GetMKLDNNData(); MKLDNNStream::Instance().RegisterPrim( pooling_forward(pdesc, *input_mem, *output_memory, *workspace_mem)); } else { @@ -120,10 +123,10 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, MKLDNNStream::Instance().Submit(); } -template void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, const NDArray &out_grad, const NDArray &in_data, - const OpReqType &req, const NDArray &in_grad) { + const NDArray *workspace, const OpReqType &req, + const NDArray &in_grad) { if (req == kNullOp) { return; } @@ -157,11 +160,11 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, auto diff_src_mem = CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req); + std::shared_ptr 
workspace_mem; - if (param.pool_type != pool_enum::kAvgPooling) { - // look-up workspace mem used for fwd - auto workspace_mem = mkldnn_wmap()[in_data.getPtr()]; - // TODO: remove workspace_mem after submit + if (MKLDNNRequireWorkspace(param)) { + CHECK(workspace != nullptr); + workspace_mem = workspace->GetMKLDNNData(); MKLDNNStream::Instance().RegisterPrim( pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), *diff_src_mem.second)); diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index ae40be3ea8b3..f31bf0cbf6a3 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -63,6 +63,20 @@ static void PoolingParamParser(nnvm::NodeAttrs *attrs) { attrs->parsed = std::move(param_); } +static bool PoolingType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + out_attrs->at(0) = in_attrs->at(0); +#if MXNET_USE_MKLDNN == 1 + const PoolingParam ¶m = nnvm::get(attrs.parsed); + if (MKLDNNRequireWorkspace(param)) { + CHECK_GT(out_attrs->size(), 1U); + out_attrs->at(1) = mshadow::kFloat32; + } +#endif + return true; +} + static bool PoolingShape(const nnvm::NodeAttrs &attrs, std::vector *in_shape, std::vector *out_shape) { @@ -98,6 +112,10 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, } out_shape->clear(); out_shape->push_back(oshape); // save output shape +#if MXNET_USE_MKLDNN == 1 + if (MKLDNNRequireWorkspace(param_)) + out_shape->push_back(oshape); // for workspace +#endif } else if (param_.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; @@ -133,6 +151,10 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, } out_shape->clear(); out_shape->push_back(oshape); // save output shape +#if MXNET_USE_MKLDNN == 1 + if (MKLDNNRequireWorkspace(param_)) + out_shape->push_back(oshape); // for workspace +#endif } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; @@ -175,6 +197,10 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape +#if MXNET_USE_MKLDNN == 1 + if (MKLDNNRequireWorkspace(param_)) + out_shape->push_back(oshape); // for workspace +#endif } return true; } @@ -184,10 +210,16 @@ void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &req, const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 + const PoolingParam ¶m = nnvm::get(attrs.parsed); + const NDArray *workspace = nullptr; + if (MKLDNNRequireWorkspace(param)) { + CHECK_GT(outputs.size(), 1U); + workspace = &outputs[1]; + } switch (inputs[0].dtype()) { case mshadow::kFloat32: - const PoolingParam ¶m = nnvm::get(attrs.parsed); - MKLDNNPooling_Forward(ctx, param, inputs[0], req[0], outputs[0]); + MKLDNNPooling_Forward(ctx, param, inputs[0], req[0], outputs[0], + workspace); return; } #endif @@ -205,11 +237,27 @@ void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &req, const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 + const PoolingParam ¶m = nnvm::get(attrs.parsed); + const NDArray &out_grad = inputs[0]; + const NDArray *workspace = nullptr; + const NDArray *in_data = nullptr; + if (MKLDNNRequireWorkspace(param)) { + // The first two elements are the gradient of the outputs in forward. + // The third is the input of forward. + // The fourth and the fifth are the outputs of forward. 
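+    // Concretely: inputs = { grad(out), grad(workspace), in_data,
+    //                        out_data, workspace }, so the forward input is
+    //                        inputs[2] and its workspace is inputs[4].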
+ CHECK_EQ(inputs.size(), 5U); + in_data = &inputs[2]; + workspace = &inputs[4]; + } + else { + CHECK_EQ(inputs.size(), 3U); + in_data = &inputs[1]; + } + const NDArray &in_grad = outputs[0]; switch (inputs[0].dtype()) { case mshadow::kFloat32: - const PoolingParam ¶m = nnvm::get(attrs.parsed); - MKLDNNPooling_Backward(ctx, param, inputs[0], inputs[1], req[0], - outputs[0]); + MKLDNNPooling_Backward(ctx, param, out_grad, *in_data, workspace, + req[0], in_grad); return; } #endif @@ -241,15 +289,16 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; return true; } #endif + CHECK_EQ(out_attrs->size(), 1); *dispatch_mode = DispatchMode::kFCompute; (*out_attrs)[0] = kDefaultStorage; return true; @@ -260,7 +309,6 @@ inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, DispatchMode *dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 3); CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 @@ -271,6 +319,7 @@ inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, return true; } #endif + CHECK_EQ(in_attrs->size(), 3); *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; @@ -320,10 +369,21 @@ height, width)*. )code" ADD_FILELINE) .set_num_inputs(1) - .set_num_outputs(1) + .set_num_outputs([](const NodeAttrs& attrs) { +#if MXNET_USE_MKLDNN == 1 + const PoolingParam ¶m = nnvm::get(attrs.parsed); + return MKLDNNRequireWorkspace(param) ? 2 : 1; +#else + return 1; +#endif + }) +#if MXNET_USE_MKLDNN == 1 + .set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { return 1; }) +#endif .set_attr_parser(PoolingParamParser) .set_attr("FInferStorageType", PoolingStorageType) - .set_attr("FInferType", ElemwiseType<1, 1>) + .set_attr("FInferType", PoolingType) .set_attr("FInferShape", PoolingShape) .set_attr("FCompute", PoolingCompute) .set_attr("FComputeEx", PoolingCompute_CPU) From 70b039931f64a5a9d8d21ab40c953f566642d4dd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 1 Dec 2017 19:21:12 +0000 Subject: [PATCH 120/264] Use a non-MKLDNN op for testing. 
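FullyConnected is now backed by MKLDNN on CPU, so this executor reshape test
was exercising the MKLDNN code path (and MKLDNN-format outputs) rather than
the executor logic it is meant to cover. Dropout has no MKLDNN implementation
and stays on the plain FCompute path. For illustration only (this snippet is
not part of the patch, and the names are made up), the storage-type pattern
that MKLDNN-backed CPU operators follow in this series, modeled on
PoolingStorageType above, is:

    // Sketch modeled on PoolingStorageType; an operator registered this way
    // produces kMKLDNNStorage outputs on CPU, which is exactly what the
    // reshape test must now avoid.
    inline static bool ExampleMKLDNNStorageType(const nnvm::NodeAttrs &attrs,
                                                const int dev_mask,
                                                DispatchMode *dispatch_mode,
                                                std::vector<int> *in_attrs,
                                                std::vector<int> *out_attrs) {
    #if MXNET_USE_MKLDNN == 1
      if (dev_mask == mshadow::cpu::kDevMask) {
        *dispatch_mode = DispatchMode::kFComputeEx;
        (*out_attrs)[0] = kMKLDNNStorage;  // MKLDNN layout reaches the caller
        return true;
      }
    #endif
      *dispatch_mode = DispatchMode::kFCompute;
      (*out_attrs)[0] = kDefaultStorage;   // plain CPU path, e.g. Dropout
      return true;
    }
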
--- tests/python/unittest/test_executor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py index e3d977df65de..04a0ef3f8600 100644 --- a/tests/python/unittest/test_executor.py +++ b/tests/python/unittest/test_executor.py @@ -136,22 +136,20 @@ def test_dot(): def test_reshape(): x = mx.sym.Variable('x') - y = mx.sym.FullyConnected(x, num_hidden=4) + y = mx.sym.Dropout(x, p=0.2) exe = y.simple_bind(mx.cpu(), x=(5,4), grad_req='null') exe.arg_arrays[0][:] = 1 - exe.arg_arrays[1][:] = mx.nd.ones((4,4)) - exe.arg_arrays[2][:] = 0 new_exe = exe.reshape(x=(3,4)) new_exe.forward(is_train=False) # test sub exec forward - assert np.all(new_exe.outputs[0].asnumpy() == 4) + assert np.all(new_exe.outputs[0].asnumpy() == 1) # test shared memory - assert np.all(exe.outputs[0].asnumpy()[:3] == 4) + assert np.all(exe.outputs[0].asnumpy()[:3] == 1) # test base exec forward exe.forward(is_train=False) - assert np.all(exe.outputs[0].asnumpy() == 4) + assert np.all(exe.outputs[0].asnumpy() == 1) if __name__ == "__main__": test_bind(disable_bulk_exec=False) From e785e5d52f8e3f60d4d9eeb903a6e7eec3d0920c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 1 Dec 2017 22:59:14 +0000 Subject: [PATCH 121/264] Allow to share arguments and their gradients between executors. --- src/executor/graph_executor.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index bf620c832604..ca5da2ea8565 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -54,6 +54,10 @@ GraphExecutor::~GraphExecutor() { } } +inline bool SharableStorage(NDArrayStorageType stype) { + return stype == kDefaultStorage || stype == kMKLDNNStorage; +} + inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape, const Context &ctx, const int dtype) { // NDArray with default storage @@ -698,7 +702,7 @@ NDArray ReshapeOrCreate(const std::string& name, const Context& ctx, std::unordered_map* shared_buffer, bool enable_row_sparse_sharing) { - bool stype_shareable = dest_arg_stype == kDefaultStorage; + bool stype_shareable = SharableStorage(dest_arg_stype); if (enable_row_sparse_sharing) { stype_shareable = stype_shareable || dest_arg_stype == kRowSparseStorage; } @@ -798,7 +802,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); auto arg_nd_stype = in_arg_nd.storage_type(); // for model parameter, both default storage and row_sparse storage can be shared - bool shareable_arg_stype = inferred_stype == kDefaultStorage || + bool shareable_arg_stype = SharableStorage(inferred_stype) || inferred_stype == kRowSparseStorage; // try to reuse memory from shared_exec CHECK(shareable_arg_stype) << "Inferred storage type " @@ -832,8 +836,8 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, auto grad_oid = grad_store_.size() + num_forward_outputs_; auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; - if (nullptr != shared_exec && grad_stype == kDefaultStorage && - shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { + if (nullptr != shared_exec && SharableStorage(grad_stype) && + shared_exec->arg_grad_map().at(arg_name).storage_type() == grad_stype) { // try to reuse memory from shared_exec 
arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); } else { From 20f1e6a8e6eef8649cf58f424cc9ba8f95861276 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 1 Dec 2017 23:32:57 +0000 Subject: [PATCH 122/264] Avoid using MKLDNN pooling when it's not supported. --- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 6 ++++++ src/operator/nn/pooling.cc | 16 +++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index f3f4c68f13e6..f55b6628a4a2 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -30,6 +30,12 @@ namespace mxnet { namespace op { +static inline bool SupportMKLDNNPooling(const PoolingParam ¶m) { + return param.kernel.ndim() == 2 + && (param.pool_type == pool_enum::kMaxPooling + || param.pool_type == pool_enum::kAvgPooling); +} + static inline algorithm GetMKLDNNPoolAlgo(const PoolingParam ¶m) { switch (param.pool_type) { case pool_enum::kMaxPooling: diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index f31bf0cbf6a3..b91a270921c3 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -69,7 +69,7 @@ static bool PoolingType(const nnvm::NodeAttrs& attrs, out_attrs->at(0) = in_attrs->at(0); #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); - if (MKLDNNRequireWorkspace(param)) { + if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) { CHECK_GT(out_attrs->size(), 1U); out_attrs->at(1) = mshadow::kFloat32; } @@ -113,7 +113,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param_)) + if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_)) out_shape->push_back(oshape); // for workspace #endif } else if (param_.kernel.ndim() == 2) { @@ -152,7 +152,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param_)) + if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_)) out_shape->push_back(oshape); // for workspace #endif } else if (param_.kernel.ndim() == 3) { @@ -198,7 +198,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param_)) + if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_)) out_shape->push_back(oshape); // for workspace #endif } @@ -291,7 +291,8 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(in_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; @@ -312,7 +313,8 @@ inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = 
kMKLDNNStorage; @@ -372,7 +374,7 @@ height, width)*. .set_num_outputs([](const NodeAttrs& attrs) { #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); - return MKLDNNRequireWorkspace(param) ? 2 : 1; + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; #else return 1; #endif From 23943f27877b30f7c3ff8eae4db71f44e329f2bd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 2 Dec 2017 00:09:54 +0000 Subject: [PATCH 123/264] Support MKLDNN properly. --- src/ndarray/ndarray.cc | 2 +- src/operator/nn/mkldnn/mkldnn_base-inl.h | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index d81866e4f2f1..5b0c5b2f4d65 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -52,7 +52,7 @@ static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, int dt #if MXNET_USE_MKLDNN == 1 // We can't always generate a MKLDNN storage. If MKLDNN can't support the data type, // we'll have to fall back to the default storage. - if (stype == kMKLDNNStorage && !SupportMKLDNN(dtype, shape)) + if (stype == kMKLDNNStorage && !SupportMKLDNNArray(dtype, shape)) return kDefaultStorage; else #endif diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 2ce19a451501..5af9eb160210 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -88,9 +88,17 @@ struct data_type_enum { enum { type = mkldnn::memory::data_type::u8 }; }; +static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) { + int ndim = shape.ndim(); + bool support = ndim == 1 || ndim == 2 || ndim == 4; + support = support && (dtype == mshadow::kFloat32 || dtype == mshadow::kInt32 + || dtype == mshadow::kInt8 || dtype == mshadow::kUint8); + return support; +} + static inline bool SupportMKLDNN(int dtype, const TShape &shape) { int ndim = shape.ndim(); - return ndim == 1 || ndim == 2 || ndim == 4; + return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); } static inline bool SupportMKLDNN(const NDArray &input) { From 801bf666f9acfea3faadc18052e2a84accf5973e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 4 Dec 2017 23:21:02 +0000 Subject: [PATCH 124/264] Choose MKLDNN softmax more carefully. --- src/operator/nn/softmax.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 50c954656000..9a5b5e91f1b3 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -62,14 +62,14 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + // We only run MKLDNN op if it runs on CPU and the input data is MKLDNN + // format. + if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; - (*out_attrs)[0] = kDefaultStorage; + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = (*in_attrs)[0]; return true; } From f89ce8a431e9fc6093fc956052292f64d925b600 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 00:57:59 +0000 Subject: [PATCH 125/264] Fix a bug in MKLDNN pooling. 
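The backward primitive descriptor was built directly from param.kernel, which
is wrong when global_pool is set: in that case the effective window is the
full input plane, not param.kernel, so the backward desc no longer matched
the one GetPoolingFwd builds. The fix below recomputes kernel_h_/kernel_w_ in
the backward exactly as the forward does. As a sketch of the invariant (this
helper is illustrative; the patch inlines the logic):

    // Both GetPoolingFwd and MKLDNNPooling_Backward must derive the kernel
    // this way, or their primitive descriptors disagree.
    static void EffectiveKernel(const PoolingParam &param,
                                const mkldnn::memory::desc &data_md,
                                int *kernel_h, int *kernel_w) {
      if (param.global_pool) {
        *kernel_h = data_md.data.dims[2];  // pool over the full height
        *kernel_w = data_md.data.dims[3];  // pool over the full width
      } else {
        *kernel_h = param.kernel[0];
        *kernel_w = param.kernel[1];
      }
    }
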
--- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index f55b6628a4a2..48fe2b761f1c 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -158,9 +158,17 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, auto alg = GetMKLDNNPoolAlgo(param); + int kernel_h_, kernel_w_; + if (param.global_pool) { + kernel_h_ = data_md.data.dims[2]; + kernel_w_ = data_md.data.dims[3]; + } else { + kernel_h_ = param.kernel[0]; + kernel_w_ = param.kernel[1]; + } pooling_backward::desc desc( alg, diff_in_md, diff_md, {(int)param.stride[0], (int)param.stride[1]}, - {(int)param.kernel[0], (int)param.kernel[1]}, {(int)param.pad[0], (int)param.pad[1]}, + {kernel_h_, kernel_w_}, {(int)param.pad[0], (int)param.pad[1]}, {(int)param.pad[0], (int)param.pad[1]}, padding_kind::zero); pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd); From 1d749f092b2e1fd950d28bc10139f1693e37f1d9 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 00:59:05 +0000 Subject: [PATCH 126/264] Fall back if MKLDNN pooling isn't supported. --- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 14 ++++++++ src/operator/nn/pooling.cc | 38 ++++++++++++++------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index 48fe2b761f1c..309cd510a4a1 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -36,6 +36,20 @@ static inline bool SupportMKLDNNPooling(const PoolingParam ¶m) { || param.pool_type == pool_enum::kAvgPooling); } +static inline bool SupportMKLDNNPooling(const PoolingParam ¶m, + const TShape &dshape) { + auto ret = SupportMKLDNNPooling(param); + if (!ret) + return false; + if (param.pooling_convention == pool_enum::kValid) + return true; + if ((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0 + && (dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0) + return true; + else + return false; +} + static inline algorithm GetMKLDNNPoolAlgo(const PoolingParam ¶m) { switch (param.pool_type) { case pool_enum::kMaxPooling: diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index b91a270921c3..bad9908a02d4 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -216,17 +216,18 @@ void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, CHECK_GT(outputs.size(), 1U); workspace = &outputs[1]; } - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNPooling_Forward(ctx, param, inputs[0], req[0], outputs[0], - workspace); - return; + if (SupportMKLDNN(inputs[0]) + && SupportMKLDNNPooling(param, inputs[0].shape())) { + MKLDNNPooling_Forward(ctx, param, inputs[0], req[0], outputs[0], + workspace); + return; } #endif // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); - std::vector out_blobs(outputs.size()); + // We know pooling has only one output. 
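+  // The second output (the MKLDNN workspace) is only produced on the MKLDNN
+  // path above, so the fallback fills just the first output.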
+ std::vector out_blobs(1); for (size_t i = 0; i < out_blobs.size(); i++) out_blobs[i] = outputs[i].data(); PoolingCompute(attrs, ctx, in_blobs, req, out_blobs); @@ -254,16 +255,27 @@ void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, in_data = &inputs[1]; } const NDArray &in_grad = outputs[0]; - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNPooling_Backward(ctx, param, out_grad, *in_data, workspace, - req[0], in_grad); - return; + if (SupportMKLDNN(inputs[0]) + && SupportMKLDNNPooling(param, inputs[0].shape())) { + MKLDNNPooling_Backward(ctx, param, out_grad, *in_data, workspace, + req[0], in_grad); + return; } #endif // TODO I need to convert format. - std::vector in_blobs(inputs.size()); - for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); + std::vector in_blobs(3); + // In this case, there isn't workspace in the input arrays. + if (inputs.size() == 3) { + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + } + else { + // There is workspace among the input arrays. One for out_grad and one for + // input. + in_blobs[0] = inputs[0].data(); // out grad + in_blobs[1] = inputs[2].data(); // in data + in_blobs[2] = inputs[3].data(); // out data + } std::vector out_blobs(outputs.size()); for (size_t i = 0; i < out_blobs.size(); i++) out_blobs[i] = outputs[i].data(); From 8f7acab2d3427e11432b4e03a3307c0d69ab3109 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 01:37:22 +0000 Subject: [PATCH 127/264] Fix a bug in Slice of NDArray. --- src/ndarray/ndarray.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 5b0c5b2f4d65..f9bd4e342857 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -309,9 +309,12 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); } - + else { + ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_; + } }, ctx(), {this->var()}, {ret.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); + ret.WaitToRead(); return ret; } #endif From 179ac61abbe0110288e9f486e9094f56dad65baa Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 01:39:13 +0000 Subject: [PATCH 128/264] Use int32 for workspace memory. --- src/operator/nn/pooling.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index bad9908a02d4..c17a879df453 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -71,7 +71,7 @@ static bool PoolingType(const nnvm::NodeAttrs& attrs, const PoolingParam ¶m = nnvm::get(attrs.parsed); if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) { CHECK_GT(out_attrs->size(), 1U); - out_attrs->at(1) = mshadow::kFloat32; + out_attrs->at(1) = mshadow::kInt32; } #endif return true; From 8e6abf911073aec51dbc172d29838707c979b06d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 04:38:33 +0000 Subject: [PATCH 129/264] Exclude MKLDNN act with tanh. 
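MKLDNN's tanh appears to have precision problems, so activation only takes
the MKLDNN path for relu, sigmoid and softrelu; tanh falls back to the
default CPU implementation via the storage-type functions below. A test-style
sketch of the intended dispatch (illustrative, not part of the patch, and
assuming the usual activation::kTanh enumerator):

    ActivationParam param;
    param.act_type = activation::kTanh;
    CHECK(!SupportMKLDNNAct(param));  // excluded: precision concerns
    param.act_type = activation::kReLU;
    CHECK(SupportMKLDNNAct(param));   // eligible for the MKLDNN path
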
--- src/operator/nn/activation.cc | 7 +++---- src/operator/nn/mkldnn/mkldnn_act-inl.h | 8 ++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index b9411e3b244d..5374495151ff 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -64,8 +64,7 @@ static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs, return; } #endif - _ActivationCompute(param, ctx, inputs[0].data(), req[0], - outputs[0].data()); + _ActivationCompute(param, ctx, inputs[0].data(), req[0], outputs[0].data()); } void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, @@ -99,7 +98,7 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -122,7 +121,7 @@ inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index b368913a61a3..eebd65390836 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -45,6 +45,14 @@ namespace mxnet { namespace op { +static inline bool SupportMKLDNNAct(const ActivationParam& param) { + // We don't include tanh for now. It seems MKLDNN tanh has some precision + // problems. + return param.act_type == activation::kReLU + || param.act_type == activation::kSigmoid + || param.act_type == activation::kSoftReLU; +} + static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { switch (param.act_type) { case activation::kReLU: From e8cc7e06169c244a8b6c19a2ac056cf999bb41c4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 21:19:53 +0000 Subject: [PATCH 130/264] Have two Reshape functions in NDArray. --- include/mxnet/ndarray.h | 6 +++ src/ndarray/ndarray.cc | 46 ++++++++++++++++--- .../nn/mkldnn/mkldnn_fully_connected.cc | 19 ++++---- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 8d51858f774f..d846e4cba38c 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -572,6 +572,12 @@ class NDArray { void CopyFrom(const mkldnn::memory &mem); std::shared_ptr CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); + + /* + * This function is used inside operators to reshape an array. + * It's used by FullyConnected right now. + */ + NDArray ReshapeMKLDNN(const TShape &shape) const; #endif /*! 
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index f9bd4e342857..6f2d896614c5 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -222,7 +222,8 @@ static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc)
   }
 }
 
-static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem) {
+static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem,
+                                             bool submit_now = true) {
   auto format = GetDefaultFormat(mem->get_primitive_desc().desc());
   if (format == mem->get_primitive_desc().desc().data.format)
     return mem;
@@ -239,13 +240,34 @@
   pd.get_engine())));
 
   MKLDNNStream &stream = MKLDNNStream::Instance();
+  stream.RegisterMem(mem);
   stream.RegisterMem(def_mem);
   stream.RegisterPrim(mkldnn::reorder(*mem, *def_mem));
-  // TODO do I have to submit it here?
-  stream.Submit();
+  if (submit_now)
+    stream.Submit();
   return def_mem;
 }
 
+NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const {
+  CHECK(!is_none()) << "NDArray is not initialized";
+  CHECK_GE(shape_.Size(), shape.Size())
+      << "NDArray.Reshape: target shape size is larger than current shape";
+  if (storage_type() == kDefaultStorage) {
+    NDArray ret = this->Detach();
+    ret.shape_ = shape;
+    return ret;
+  } else if (storage_type() == kMKLDNNStorage) {
+    NDArray ret(kMKLDNNStorage, shape, ctx(), ptr_->delay_alloc, dtype());
+    CHECK(ptr_->Mkl_mem_ != nullptr);
+    // We shouldn't submit the reorder primitive here because submit will
+    // be called in operators.
+    ret.ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_, false);
+    return ret;
+  }
+  LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet";
+  return NDArray();
+}
+
 #endif
 
 NDArray NDArray::Reshape(const TShape &shape) const {
@@ -258,10 +280,20 @@
     return ret;
 #if MXNET_USE_MKLDNN == 1
   } else if (storage_type() == kMKLDNNStorage) {
-    NDArray ret = this->Detach();
-    ret.shape_ = shape;
-    if (ret.ptr_->Mkl_mem_)
-      ret.ptr_->Mkl_mem_ = Reorder2Default(ret.ptr_->Mkl_mem_);
+    NDArray ret(kMKLDNNStorage, shape, ctx(), ptr_->delay_alloc, dtype());
+    // We need to convert the MKL memory to the default layout.
+    Engine::Get()->PushSync([&](RunContext ctx) {
+      if (this->ptr_->Mkl_mem_) {
+        auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc());
+        if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) {
+          ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_);
+        } else {
+          ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_;
+        }
+      }
+    }, ctx(), {this->var()}, {ret.var()},
+    FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default"));
+    ret.WaitToRead();
     return ret;
 #endif
   }
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
index 9ed1f0da08a8..17f504b9062e 100644
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -92,8 +92,8 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   NDArray data = in_data[fullc::kData];
   auto out_md = GetMemDesc(out_data[fullc::kOut]);
   if (data.shape().ndim() != 2 && !param.flatten) {
-    data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
-                               ishape[ishape.ndim()-1]));
+    data = data.ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
+                                     ishape[ishape.ndim()-1]));
     // TODO this can potentially be a problem when casting the type. 
mkldnn::memory::dims out_dims{(int) oshape.ProdShape(0, oshape.ndim()-1), (int) oshape[ishape.ndim()-1]}; @@ -101,7 +101,7 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, mkldnn::memory::format::any); } else if (data.shape().ndim() != 2) { - data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + data = data.ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); // TODO this can potentially be a problem when casting the type. mkldnn::memory::dims out_dims{(int) oshape[0], (int) oshape.ProdShape(1, oshape.ndim())}; out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), @@ -137,15 +137,18 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, NDArray weight = inputs[fullc::kWeight + 1]; NDArray data = inputs[fullc::kData + 1]; if (data.shape().ndim() != 2 && !param.flatten) - data = data.Reshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), - ishape[ishape.ndim()-1])); + data = data.ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1), + ishape[ishape.ndim()-1])); else if (data.shape().ndim() != 2) - data = data.Reshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + data = data.ReshapeMKLDNN(Shape2(ishape[0], + ishape.ProdShape(1, ishape.ndim()))); NDArray out_grad = inputs[fullc::kOut]; if (out_grad.shape().ndim() != 2 && !param.flatten) - out_grad = out_grad.Reshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1])); + out_grad = out_grad.ReshapeMKLDNN(Shape2(oshape.ProdShape(0, oshape.ndim()-1), + oshape[oshape.ndim()-1])); else if (out_grad.shape().ndim() != 2) - out_grad = out_grad.Reshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim()))); + out_grad = out_grad.ReshapeMKLDNN(Shape2(oshape[0], + oshape.ProdShape(1, oshape.ndim()))); mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad)); From 801bebba525718add369b20f730f7306121442c4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 22:08:57 +0000 Subject: [PATCH 131/264] Move concat to nn/ --- src/operator/{ => nn}/concat-inl.h | 0 src/operator/{ => nn}/concat.cc | 0 src/operator/{ => nn}/concat.cu | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/operator/{ => nn}/concat-inl.h (100%) rename src/operator/{ => nn}/concat.cc (100%) rename src/operator/{ => nn}/concat.cu (100%) diff --git a/src/operator/concat-inl.h b/src/operator/nn/concat-inl.h similarity index 100% rename from src/operator/concat-inl.h rename to src/operator/nn/concat-inl.h diff --git a/src/operator/concat.cc b/src/operator/nn/concat.cc similarity index 100% rename from src/operator/concat.cc rename to src/operator/nn/concat.cc diff --git a/src/operator/concat.cu b/src/operator/nn/concat.cu similarity index 100% rename from src/operator/concat.cu rename to src/operator/nn/concat.cu From bb1570ae325201ec9451c322977c323caa1f788b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 23:03:48 +0000 Subject: [PATCH 132/264] Use NNVM interface for concat. 
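This follows the same conversion as pooling and upsampling: the
OperatorProperty class goes away, shape and type inference become free
functions registered as NNVM attributes, and the backward becomes a separate
op marked TIsBackward. Schematically (a sketch with generic names and the
attribute template arguments written out, not a literal excerpt from the
patch):

    NNVM_REGISTER_OP(SomeOp)
    .set_num_inputs(1)
    .set_num_outputs(1)
    .set_attr_parser(ParamParser<SomeParam>)
    .set_attr<nnvm::FInferShape>("FInferShape", SomeOpShape)
    .set_attr<nnvm::FInferType>("FInferType", SomeOpType)
    .set_attr<FCompute>("FCompute", SomeOpCompute<cpu>)
    .set_attr<nnvm::FGradient>("FGradient", SomeOpGrad{"_backward_SomeOp"});

    NNVM_REGISTER_OP(_backward_SomeOp)
    .set_num_outputs(1)
    .set_attr_parser(ParamParser<SomeParam>)
    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
    .set_attr<FCompute>("FCompute", SomeOpGradCompute<cpu>);

Concat additionally sets key_var_num_args ("num_args") so its variable number
of inputs keeps working through nnvm.
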
--- src/operator/nn/concat-inl.h | 180 ++++++++--------------------------- src/operator/nn/concat.cc | 146 +++++++++++++++++++++------- src/operator/nn/concat.cu | 14 ++- 3 files changed, 154 insertions(+), 186 deletions(-) diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h index fdbe33072a0f..411ad23eea8b 100644 --- a/src/operator/nn/concat-inl.h +++ b/src/operator/nn/concat-inl.h @@ -33,8 +33,8 @@ #include #include #include -#include "./operator_common.h" -#include "./channel_op_common.h" +#include "../operator_common.h" +#include "../channel_op_common.h" namespace mxnet { namespace op { @@ -56,16 +56,17 @@ struct ConcatParam : public dmlc::Parameter { }; // struct ConcatParam template -class ConcatOp : public Operator { +class ConcatOp { public: - explicit ConcatOp(ConcatParam param) - : size_(param.num_args), dimension_(param.dim) {} + void Init(const ConcatParam ¶m) { + this->size_ = param.num_args; + this->dimension_ = param.dim; + } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(static_cast(in_data.size()), size_); @@ -92,13 +93,10 @@ class ConcatOp : public Operator { Concatenate(data, &out, 1, req[concat_enum::kOut]); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); @@ -130,133 +128,31 @@ class ConcatOp : public Operator { }; // class ConcatOp template -Operator *CreateOp(ConcatParam param, int dtype, std::vector *in_shape); - -#if DMLC_USE_CXX11 -class ConcatProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - std::vector ListArguments() const override { - std::vector ret; - for (int i = 0; i < param_.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - TShape dshape; - index_t size = 0; - bool has_zero = false; - for (int i = 0; i < param_.num_args; ++i) { - TShape tmp = (*in_shape)[i]; - if (tmp.ndim()) { - CHECK_LT(static_cast(param_.dim), tmp.ndim()) - << "concat dim " << param_.dim << " out of range of input shape " << tmp; - has_zero = tmp[param_.dim] == 0 || has_zero; - size += tmp[param_.dim]; - tmp[param_.dim] = 0; - shape_assign(&dshape, tmp); - } - } - - TShape tmp = (*out_shape)[0]; - if (tmp.ndim()) { - CHECK_LT(static_cast(param_.dim), tmp.ndim()) - << "concat dim " << param_.dim << " out of range of input shape " << tmp; - tmp[param_.dim] = 0; - shape_assign(&dshape, tmp); - } - - if (dshape.ndim() == 0) return false; - - for (int i = 0; i < param_.num_args; ++i) { - CHECK(shape_assign(&(*in_shape)[i], dshape)) - << "Incompatible input shape: expected " << dshape << ", 
got " << (*in_shape)[i]; - } - - if (!has_zero) dshape[param_.dim] = size; - CHECK(shape_assign(&(*out_shape)[0], dshape)) - << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; - - return dshape.Size() != 0; - } +void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConcatParam& param = nnvm::get(attrs.parsed); + MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, { + ConcatOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - int dtype = -1; - - for (size_t i = 0; i < in_type->size(); ++i) { - if (dtype == -1) { - dtype = in_type->at(i); - } else { - CHECK(in_type->at(i) == dtype || - in_type->at(i) == -1) << - "Non-uniform data type in Concat"; - } - } - - if (dtype == -1) { - LOG(FATAL) << "Not enough information to infer type in Concat."; - return false; - } - - size_t nin = this->ListArguments().size(); - in_type->clear(); - for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); - - size_t naux = this->ListAuxiliaryStates().size(); - aux_type->clear(); - for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype); - - size_t nout = this->ListOutputs().size(); - out_type->clear(); - for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); - - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ConcatProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Concat"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return out_grad; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConcatParam& param = nnvm::get(attrs.parsed); + MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, { + ConcatOp op; + op.Init(param); + op.Backward(ctx, inputs, req, outputs); + }); +} - private: - ConcatParam param_; -}; // class ConcatProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 4d3c2fa1661f..61b9f517eb56 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -25,51 +25,97 @@ */ #include "./concat-inl.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_concat-inl.h" -#endif // MXNET_USE_MKL2017 namespace mxnet { namespace op { -template<> -Operator* CreateOp(ConcatParam param, int dtype, std::vector *in_shape) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - // MKL supports 4D input tensors only for concat operation - // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h - // hence MKL supports 2D/3D/4D input tensors for concat operation - size_t dims = (*in_shape)[0].ndim(); - bool supportedDim = (dims >= 2 && dims <= 4); - if ((1 == param.dim) && supportedDim && - (param.num_args < (dnnResourceMultipleDst - dnnResourceMultipleSrc))) { - switch (dtype) { - case mshadow::kFloat32: - return new 
MKLConcatOp(param); - case mshadow::kFloat64: - return new MKLConcatOp(param); - default: - break; + +static bool ConcatShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + const ConcatParam& param_ = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + TShape dshape; + index_t size = 0; + bool has_zero = false; + for (int i = 0; i < param_.num_args; ++i) { + TShape tmp = (*in_shape)[i]; + if (tmp.ndim()) { + CHECK_LT(static_cast(param_.dim), tmp.ndim()) + << "concat dim " << param_.dim << " out of range of input shape " << tmp; + has_zero = tmp[param_.dim] == 0 || has_zero; + size += tmp[param_.dim]; + tmp[param_.dim] = 0; + shape_assign(&dshape, tmp); } } - if (enableMKLWarnGenerated()) - LOG(INFO) << MKLConcatOp::getName() << " Skip MKL optimization"; -#endif - MSHADOW_TYPE_SWITCH(dtype, DType, { - op = new ConcatOp(param); - }); - return op; + + TShape tmp = (*out_shape)[0]; + if (tmp.ndim()) { + CHECK_LT(static_cast(param_.dim), tmp.ndim()) + << "concat dim " << param_.dim << " out of range of input shape " << tmp; + tmp[param_.dim] = 0; + shape_assign(&dshape, tmp); + } + + if (dshape.ndim() == 0) return false; + + for (int i = 0; i < param_.num_args; ++i) { + CHECK(shape_assign(&(*in_shape)[i], dshape)) + << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; + } + + if (!has_zero) dshape[param_.dim] = size; + CHECK(shape_assign(&(*out_shape)[0], dshape)) + << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; + + return dshape.Size() != 0; } -Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape); +static bool ConcatType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const ConcatParam& param_ = nnvm::get(attrs.parsed); + int dtype = -1; + + for (size_t i = 0; i < in_type->size(); ++i) { + if (dtype == -1) { + dtype = in_type->at(i); + } else { + CHECK(in_type->at(i) == dtype || + in_type->at(i) == -1) << + "Non-uniform data type in Concat"; + } + } + + if (dtype == -1) { + LOG(FATAL) << "Not enough information to infer type in Concat."; + return false; + } + + size_t nin = param_.num_args; + in_type->clear(); + for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); + + out_type->clear(); + out_type->push_back(dtype); + + return true; } +struct ConcatGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + const ConcatParam& param = nnvm::get(n->attrs.parsed); + std::vector heads(ograds.begin(), ograds.end()); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + DMLC_REGISTER_PARAMETER(ConcatParam); -MXNET_REGISTER_OP_PROPERTY(Concat, ConcatProp) +NNVM_REGISTER_OP(Concat) .describe(R"code(Joins input arrays along a given axis. .. note:: `Concat` is deprecated. Use `concat` instead. 
@@ -102,11 +148,39 @@ Example:: [ 5., 5., 8., 8.]] )code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + std::vector ret; + for (int i = 0; i < params.num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); + } + return ret; +}) +.set_attr("FInferShape", ConcatShape) +.set_attr("FInferType", ConcatType) +.set_attr("FCompute", ConcatCompute) +.set_attr("FGradient", ConcatGrad{"_backward_Concat"}) +.set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") -.add_arguments(ConcatParam::__FIELDS__()) -.set_key_var_num_args("num_args"); +.add_arguments(ConcatParam::__FIELDS__()); NNVM_REGISTER_OP(Concat).add_alias("concat"); +NNVM_REGISTER_OP(_backward_Concat) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FCompute", ConcatGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/concat.cu b/src/operator/nn/concat.cu index 394fa736ee84..f6bf5ece5c78 100644 --- a/src/operator/nn/concat.cu +++ b/src/operator/nn/concat.cu @@ -28,14 +28,12 @@ namespace mxnet { namespace op { -template<> -Operator* CreateOp(ConcatParam param, int dtype, std::vector *in_shape) { - Operator *op = NULL; - MSHADOW_TYPE_SWITCH(dtype, DType, { - op = new ConcatOp(param); - }); - return op; -} + +NNVM_REGISTER_OP(Concat) +.set_attr("FCompute", ConcatCompute); + +NNVM_REGISTER_OP(_backward_Concat) +.set_attr("FCompute", ConcatGradCompute); } // namespace op } // namespace mxnet From d572bc985300dec116d0a1ac8fbd3c3a1f24dfe2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 5 Dec 2017 23:05:27 +0000 Subject: [PATCH 133/264] Move lrn to nn/. --- src/operator/{ => nn}/lrn-inl.h | 0 src/operator/{ => nn}/lrn.cc | 0 src/operator/{ => nn}/lrn.cu | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/operator/{ => nn}/lrn-inl.h (100%) rename src/operator/{ => nn}/lrn.cc (100%) rename src/operator/{ => nn}/lrn.cu (100%) diff --git a/src/operator/lrn-inl.h b/src/operator/nn/lrn-inl.h similarity index 100% rename from src/operator/lrn-inl.h rename to src/operator/nn/lrn-inl.h diff --git a/src/operator/lrn.cc b/src/operator/nn/lrn.cc similarity index 100% rename from src/operator/lrn.cc rename to src/operator/nn/lrn.cc diff --git a/src/operator/lrn.cu b/src/operator/nn/lrn.cu similarity index 100% rename from src/operator/lrn.cu rename to src/operator/nn/lrn.cu From 539e3e7c7e458eef9235ef4177ba4c8bf6c0b987 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 6 Dec 2017 00:14:37 +0000 Subject: [PATCH 134/264] Use NNVM interface for LRN. 
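LRN keeps its two forward outputs (the result and the saved tmp_norm), but
the operator class is replaced by the free functions LRNForward/LRNBackward
below, and the backward consumes exactly three inputs: the output gradient,
the forward input, and tmp_norm. The gradient registration in lrn.cc (cut off
in this excerpt) therefore needs a functor along these lines (a sketch, not
the literal code):

    // Sketch of the gradient node that feeds LRNGradCompute its three
    // inputs: the output gradient, the forward input, and tmp_norm.
    struct LRNGrad {
      const char *op_name;
      std::vector<nnvm::NodeEntry> operator()(
          const nnvm::NodePtr &n,
          const std::vector<nnvm::NodeEntry> &ograds) const {
        std::vector<nnvm::NodeEntry> heads;
        heads.push_back(ograds[0]);                   // out_grad
        heads.push_back(n->inputs[lrn_enum::kData]);  // forward input
        heads.push_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0});  // tmp_norm
        return MakeGradNode(op_name, n, heads, n->attrs.dict);
      }
    };
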
--- src/operator/nn/lrn-inl.h | 204 +++++++++++--------------------------- src/operator/nn/lrn.cc | 83 ++++++++++++---- src/operator/nn/lrn.cu | 23 ++--- 3 files changed, 128 insertions(+), 182 deletions(-) diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h index adfe4676702d..2dfecea0bde1 100644 --- a/src/operator/nn/lrn-inl.h +++ b/src/operator/nn/lrn-inl.h @@ -32,8 +32,8 @@ #include #include #include -#include "./operator_common.h" -#include "./mshadow_op.h" +#include "../operator_common.h" +#include "../mshadow_op.h" namespace mxnet { namespace op { @@ -61,155 +61,67 @@ struct LRNParam : public dmlc::Parameter { }; // struct LRNParam template -class LocalResponseNormOp : public Operator { - public: - explicit LocalResponseNormOp(LRNParam param) { - param_ = param; - } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - // TODO(xxx): Test with gradient chceker - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - // CHECK_EQ(req.size(), 2); - CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; - const real_t salpha = param_.alpha / param_.nsize; - Stream *s = ctx.get_stream(); - Tensor data = in_data[lrn_enum::kData].get(s); - Tensor out = out_data[lrn_enum::kOut].get(s); - Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); - tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; - Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - const real_t salpha = param_.alpha / param_.nsize; - Stream *s = ctx.get_stream(); - Tensor grad = out_grad[lrn_enum::kOut].get(s); - Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); - Tensor data = in_data[lrn_enum::kData].get(s); - Tensor grad_in = in_grad[lrn_enum::kData].get(s); - grad_in = grad * F(tmp_norm, -param_.beta); - grad_in += (- 2.0f * param_.beta * salpha) * - chpool(grad * data * - F(tmp_norm, -param_.beta - 1.0f), - param_.nsize) * data; - } - - private: - LRNParam param_; -}; // class LocalResponseNormOp +void LRNForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + const LRNParam& param_ = nnvm::get(attrs.parsed); + // TODO(xxx): Test with gradient chceker + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 2U); + // CHECK_EQ(req.size(), 2); + CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; + const real_t salpha = param_.alpha / param_.nsize; + Stream *s = ctx.get_stream(); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); + tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; + Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); +} template -Operator *CreateOp(LRNParam param, int dtype); - -#if DMLC_USE_CXX11 -class LocalResponseNormProp : public 
OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - int n_out = this->ListOutputs().size(); - out_type->clear(); - for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new LocalResponseNormProp(); - ptr->param_ = param_; - return ptr; - } +void LRNBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const TBlob &out_grad, const TBlob &in_data, + const TBlob &out_norm, const OpReqType &req, + const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + const LRNParam& param_ = nnvm::get(attrs.parsed); + const real_t salpha = param_.alpha / param_.nsize; + Stream *s = ctx.get_stream(); + Tensor grad = out_grad.get(s); + Tensor tmp_norm = out_norm.get(s); + Tensor data = in_data.get(s); + Tensor grad_in = in_grad.get(s); + grad_in = grad * F(tmp_norm, -param_.beta); + grad_in += (- 2.0f * param_.beta * salpha) * + chpool(grad * data * + F(tmp_norm, -param_.beta - 1.0f), + param_.nsize) * data; +} - std::string TypeString() const override { - return "LRN"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return { - out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], - out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut] - }; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 2; - } - - std::vector ListArguments() const override { - return {"data"}; - } - - std::vector ListOutputs() const override { - return {"output", "tmp_norm"}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } +template +void LRNCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LRNForward(attrs, ctx, inputs, req, outputs); +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LRNBackward(attrs, ctx, inputs[0], // out_grad + inputs[1], // in_data + inputs[2], // out_norm + req[lrn_enum::kData], outputs[lrn_enum::kData]); +} - private: - LRNParam param_; -}; // LocalResponseNormProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_LRN_INL_H_ diff --git a/src/operator/nn/lrn.cc 
b/src/operator/nn/lrn.cc index 9b3afd80cd18..21bf457512f2 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -28,33 +28,61 @@ #if MXNET_USE_CUDNN == 1 #include "./cudnn_lrn-inl.h" #endif -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_lrn-inl.h" -#endif namespace mxnet { namespace op { -template<> -Operator* CreateOp(LRNParam param, int dtype) { -#if MXNET_USE_MKL2017 == 1 - return new MKLLRNOp(param); -#endif - return new LocalResponseNormOp(param); + +static bool LRNShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; } -// DO_BIND_DISPATCH comes from operator_common.h -Operator* LocalResponseNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +static inline std::vector ListArguments() { + return {"data"}; } +static bool LRNType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + } + } + int n_out = 2; + out_type->clear(); + for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype); + return true; +} + +struct LRNGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(ograds[0]); // out_grad + heads.push_back(n->inputs[lrn_enum::kData]); + heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + DMLC_REGISTER_PARAMETER(LRNParam); -MXNET_REGISTER_OP_PROPERTY(LRN, LocalResponseNormProp) -.add_argument("data", "NDArray-or-Symbol", "Input data.") -.add_arguments(LRNParam::__FIELDS__()) +NNVM_REGISTER_OP(LRN) .describe(R"code(Applies local response normalization to the input. The local response normalization layer performs "lateral inhibition" by normalizing @@ -70,7 +98,24 @@ activity :math:`b_{x,y}^{i}` is given by the expression: where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total number of kernels in the layer. 
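Writing :math:`c_{x,y}^{i} = k + \frac{\alpha}{n} \sum_{j} (a_{x,y}^{j})^2` for the
denominator, the backward pass implemented by ``LRNBackward`` above computes

.. math::
   \frac{\partial L}{\partial a_{x,y}^{i}} =
   g_{x,y}^{i} \, (c_{x,y}^{i})^{-\beta}
   - \frac{2 \beta \alpha}{n} \, a_{x,y}^{i}
   \sum_{j} g_{x,y}^{j} \, a_{x,y}^{j} \, (c_{x,y}^{j})^{-\beta-1}

where :math:`g` is the gradient of the loss w.r.t. the output and the sum again
runs over the :math:`n` adjacent kernel maps.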
-)code" ADD_FILELINE); +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { return 1; }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", LRNShape) +.set_attr("FInferType", LRNType) +.set_attr("FCompute", LRNCompute) +.set_attr("FGradient", LRNGrad{"_backward_LRN"}) +.add_argument("data", "NDArray-or-Symbol", "Input data to LRN") +.add_arguments(LRNParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_LRN) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FCompute", LRNGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/lrn.cu b/src/operator/nn/lrn.cu index ba872f1d26d0..83dd1d0322ea 100644 --- a/src/operator/nn/lrn.cu +++ b/src/operator/nn/lrn.cu @@ -31,23 +31,12 @@ namespace mxnet { namespace op { -template<> -Operator* CreateOp(LRNParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_CUDNN == 1 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNLocalResponseNormOp(param); - }) -#else -#if CUDA_VERSION == 7000 - LOG(FATAL) << "Due to old CUDA compiler bug, LRN is disabled." - << "Please upgrade CUDA to 7.5+ or use CUDNN"; -#else - op = new LocalResponseNormOp(param); -#endif // CUDA_VERSION -#endif // MXNET_USE_CUDNN - return op; -} + +NNVM_REGISTER_OP(LRN) +.set_attr("FCompute", LRNCompute); + +NNVM_REGISTER_OP(_backward_LRN) +.set_attr("FCompute", LRNGradCompute); } // namespace op } // namespace mxnet From 5439b3112bd3ff5036364a8be459ebd630efd4fb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 6 Dec 2017 22:57:09 +0000 Subject: [PATCH 135/264] Copy data for NDArray with diff shapes. --- src/ndarray/ndarray.cc | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 6f2d896614c5..f826c54dcedf 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -594,9 +594,31 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { if (ptr_->Mkl_mem_.get() == &mem) return; - // TODO if the shape mismatches. + if (mem.get_primitive_desc().get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return; + } + + MKLDNNStream &stream = MKLDNNStream::Instance(); ptr_->SetMKLMem(shape_, dtype_); - MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); + auto from_desc = mem.get_primitive_desc().desc(); + auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); + // It's possible that the memory and the NDArray don't have the same shape. + if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { + // In this case, we can simply create a new MKLDNN memory for the required + // shape. + // TODO let's just hope it's the default format for now. 
+    CHECK_EQ(GetDefaultFormat(from_desc), from_desc.data.format);
+    mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims);
+    mkldnn::memory::desc data_md(dims, static_cast<mkldnn::memory::data_type>(this_desc.data.data_type),
+                                 static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc)));
+    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+    stream.RegisterMem(tmp_mem);
+    stream.RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_));
+  }
+  else
+    stream.RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_));
 }
 
 std::shared_ptr<mkldnn::memory> NDArray::CreateMKLDNNData(

From 6d07be9445321f3d4f7e3451043ddc1b02fe351f Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Wed, 6 Dec 2017 22:58:59 +0000
Subject: [PATCH 136/264] Add MKLDNN copy.

---
 src/operator/nn/mkldnn/mkldnn_base-inl.h      |  3 +-
 src/operator/nn/mkldnn/mkldnn_copy.cc         | 58 +++++++++++++++++++
 src/operator/nn/mkldnn/mkldnn_ops-inl.h       |  5 ++
 .../tensor/elemwise_unary_op_basic.cc         | 56 ++++++++++++++++--
 4 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 src/operator/nn/mkldnn/mkldnn_copy.cc

diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 5af9eb160210..5c04071a7783 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -169,7 +169,8 @@ class MKLDNNStream {
   void RegisterMem(mkldnn_mem_const_ptr mem) { mem_holder.push_back(mem); }
 
   void Submit() {
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    if (!net.empty())
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
     net.clear();
     mem_holder.clear();
   }
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
new file mode 100644
index 000000000000..6f1975dd279b
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_copy.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../softmax-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                const NDArray &in_data, const OpReqType &req,
+                const NDArray &out_data) {
+  auto in_mem = in_data.GetMKLDNNData();
+  if (req == kAddTo) {
+    // We should try to force the output memory to have the same format
+    // as the input memory. If not, we'll have to reorder memory.
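+    // kAddTo means out += in: compute the element-wise sum into a fresh
+    // temporary (sum_res below) and copy it back into out_data, so the
+    // output buffer is never both read and written by the same primitive.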
+ auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc()); + if (out_mem == nullptr) + out_mem = out_data.GetMKLDNNData(); + mkldnn_mem_ptr sum_res( + new mkldnn::memory(out_mem->get_primitive_desc())); + MKLDNNStream::Instance().RegisterMem(sum_res); + Sum(*in_mem, *out_mem, *sum_res); + const_cast(out_data).CopyFrom(*sum_res); + } + else { + const_cast(out_data).CopyFrom(*in_mem); + } + MKLDNNStream::Instance().Submit(); +} + +} +} +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 2d5513cc9854..ffeaf67fa74a 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -71,6 +71,11 @@ void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const OpReqType &req, const NDArray &out_data); +/* For copy */ +void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); + } } #endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 916c385467cf..203673a4b247 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -24,6 +24,7 @@ #include #include "elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { @@ -108,12 +109,59 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sigmoid, unary_bwd); // copy +static void CopyEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); +#if MXNET_USE_MKLDNN == 1 + if (in_stype == kMKLDNNStorage) { + MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); + return; + } + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. 
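+  // (DetermineSType falls back to kDefaultStorage when MKLDNN can't hold
+  // the array, so the inferred kMKLDNNStorage and the actual storage type
+  // can disagree; the plain TBlob path below covers that case.)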
+ else if (inputs[0].storage_type() == kDefaultStorage) { + std::vector in_blobs(1); + std::vector out_blobs(1); + in_blobs[0] = inputs[0].data(); + out_blobs[0] = outputs[0].data(); + UnaryOp::IdentityCompute(attrs, ctx, in_blobs, req, out_blobs); + return; + } +#endif + UnaryOp::IdentityComputeEx(attrs, ctx, inputs, req, outputs); +} + +static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); +#if MXNET_USE_MKLDNN == 1 + if (in_attrs->at(0) == kMKLDNNStorage && dev_mask == mshadow::cpu::kDevMask) { + out_attrs->at(0) = kMKLDNNStorage; + *dispatch_mode = DispatchMode::kFComputeEx; + return true; + } else +#endif + return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +} + MXNET_OPERATOR_REGISTER_UNARY(_copy) .MXNET_DESCRIBE("Returns a copy of the input.") .add_alias("identity") -.set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) +.set_attr("FInferStorageType", CopyStorageType) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", UnaryOp::IdentityComputeEx) +.set_attr("FComputeEx", CopyEx) .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; @@ -128,9 +176,9 @@ NNVM_REGISTER_OP(_backward_copy) [](const NodeAttrs& attrs){ return std::vector >{{0, 0}}; }) -.set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) +.set_attr("FInferStorageType", CopyStorageType) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", UnaryOp::IdentityComputeEx) +.set_attr("FComputeEx", CopyEx) .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; From ea576dda66b2908e89874b3c530f52bb634e7860 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 7 Dec 2017 01:46:41 +0000 Subject: [PATCH 137/264] Add MKLDNN version of elemwise_add. --- .../tensor/elemwise_binary_op_basic.cc | 103 +++++++++++++++++- 1 file changed, 97 insertions(+), 6 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index d7e5e04ce87a..4d51d5b0e0b6 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -24,11 +24,69 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { -MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, op::mshadow_op::plus) +static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); +#if MXNET_USE_MKLDNN == 1 + if (inputs[0].storage_type() == kMKLDNNStorage + || inputs[1].storage_type() == kMKLDNNStorage) { + MKLDNNSum_Forward(attrs, ctx, inputs, req[0], outputs[0]); + return; + } + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. 
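+  // Same fallback as in CopyEx above: either operand may have silently
+  // fallen back to default storage, so add them through the TBlob kernel.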
+ else if (inputs[0].storage_type() == kDefaultStorage + || inputs[1].storage_type() == kDefaultStorage) { + std::vector in_blobs(2); + std::vector out_blobs(1); + in_blobs[0] = inputs[0].data(); + in_blobs[1] = inputs[1].data(); + out_blobs[0] = outputs[0].data(); + ElemwiseBinaryOp::Compute(attrs, ctx, in_blobs, + req, out_blobs); + return; + } +#endif + ElemwiseBinaryOp::ComputeEx(attrs, ctx, inputs, + req, outputs); +} + +static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2); + CHECK_EQ(out_attrs->size(), 1); +#if MXNET_USE_MKLDNN == 1 + if ((in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(1) == kMKLDNNStorage) + && dev_mask == mshadow::cpu::kDevMask) { + out_attrs->at(0) = kMKLDNNStorage; + *dispatch_mode = DispatchMode::kFComputeEx; + return true; + } else +#endif + return ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +} + +MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) +.set_attr("FInferStorageType", ElemwiseAddStorageType) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FComputeEx", ElemwiseAddEx) +.set_attr("FResourceRequest", /* For Sparse CSR */ + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace};}) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add) .add_alias("_add").add_alias("_plus").add_alias("_Plus") .describe(R"code(Adds arguments element-wise. @@ -46,6 +104,42 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs // this must differ from elemwise_add to prevent add to optimization in forward pass. MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus); +static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U); +#if MXNET_USE_MKLDNN == 1 + if (inputs[0].storage_type() == kMKLDNNStorage) { + MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); + MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]); + } else +#endif + ElemwiseBinaryOp::BackwardUseNoneEx( + attrs, ctx, inputs, req, outputs); +} + +static inline bool _backward_ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 2); +#if MXNET_USE_MKLDNN == 1 + if (in_attrs->at(0) == kMKLDNNStorage && dev_mask == mshadow::cpu::kDevMask) { + out_attrs->at(0) = kMKLDNNStorage; + out_attrs->at(1) = kMKLDNNStorage; + *dispatch_mode = DispatchMode::kFComputeEx; + return true; + } else +#endif + return ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +} + NNVM_REGISTER_OP(_backward_add) .set_num_inputs(1) .set_num_outputs(2) @@ -57,11 +151,8 @@ NNVM_REGISTER_OP(_backward_add) }) .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseNone< cpu, mshadow_op::identity, mshadow_op::identity>) -.set_attr("FComputeEx", - ElemwiseBinaryOp::BackwardUseNoneEx) -.set_attr("FInferStorageType", - ElemwiseStorageType<1, 2, true, true, true>); +.set_attr("FComputeEx", _backward_ElemwiseAddEx) +.set_attr("FInferStorageType", _backward_ElemwiseAddStorageType); MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_sub, op::mshadow_op::minus) 
MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub)

From 94cbaae4756dc426fbf3e4f0090d2b7b5c71680c Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Thu, 7 Dec 2017 02:11:51 +0000
Subject: [PATCH 138/264] Add MKLDNN version of Flatten.

---
 src/operator/tensor/matrix_op.cc | 53 ++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 8f36e35d279f..5b889bda1f50 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -25,6 +25,7 @@
 // this will be invoked by gcc and compile CPU version
 #include "./matrix_op-inl.h"
 #include "./elemwise_unary_op.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -122,6 +123,56 @@ If the argument `reverse` is set to 1, then the special values are inferred from
 .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
 .add_arguments(ReshapeParam::__FIELDS__());
 
+static void FlattenEx(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const auto in_stype = inputs[0].storage_type();
+  const auto out_stype = outputs[0].storage_type();
+#if MXNET_USE_MKLDNN == 1
+  if (in_stype == kMKLDNNStorage) {
+    NDArray data = inputs[0];
+    if (data.shape().ndim() != 2) {
+      const TShape& oshape = outputs[0].shape();
+      data = data.ReshapeMKLDNN(mshadow::Shape2(oshape[0], oshape[1]));
+    }
+    MKLDNNCopy(attrs, ctx, data, req[0], outputs[0]);
+    return;
+  }
+  // This happens if inputs are supposed to be in MKLDNN format
+  // but MKLDNN doesn't support the data type or the shape. We're
+  // forced to convert it to the default format.
+  else if (in_stype == kDefaultStorage) {
+    std::vector<TBlob> in_blobs(1);
+    std::vector<TBlob> out_blobs(1);
+    in_blobs[0] = inputs[0].data();
+    out_blobs[0] = outputs[0].data();
+    UnaryOp::IdentityCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+    return;
+  }
+#endif
+}
+
+static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
+                                      const int dev_mask,
+                                      DispatchMode* dispatch_mode,
+                                      std::vector<int> *in_attrs,
+                                      std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+#if MXNET_USE_MKLDNN == 1
+  if (in_attrs->at(0) == kMKLDNNStorage && dev_mask == mshadow::cpu::kDevMask) {
+    out_attrs->at(0) = kMKLDNNStorage;
+    *dispatch_mode = DispatchMode::kFComputeEx;
+    return true;
+  } else
+#endif
+  return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode,
+                                                      in_attrs, out_attrs);
+}
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
@@ -152,8 +203,10 @@ Example::
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", FlattenShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
 .set_attr<FCompute>("FCompute", UnaryOp::IdentityCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx", FlattenEx)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};

From 13c01993445260c144cb8a0a3be015876d01dbf4 Mon Sep 17 00:00:00 2001
From: wentingj
Date: Thu, 7 Dec 2017 16:52:00 +0800
Subject: [PATCH 139/264] add mkldnn support for concat

---
 src/operator/nn/concat.cc               | 101 +++++++++++++++++++++-
 src/operator/nn/mkldnn/mkldnn_concat.cc |  85 ++++++++++++++++++++
 src/operator/nn/mkldnn/mkldnn_ops-inl.h |   8 ++
 3 files changed, 192 insertions(+), 2 deletions(-)
 create mode 100644 src/operator/nn/mkldnn/mkldnn_concat.cc

diff --git
a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 61b9f517eb56..d17bf8054238 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -25,6 +25,7 @@ */ #include "./concat-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { @@ -103,12 +104,104 @@ static bool ConcatType(const nnvm::NodeAttrs& attrs, return true; } +inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); + CHECK_EQ(out_attrs->size(), 1U); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[0] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +inline static bool backward_ConcatStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { +#if MXNET_USE_MKLDNN == 1 + CHECK_EQ(out_attrs->size(), in_attrs->size() - 1); + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& op_ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK(!inputs.empty()); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (req[0] == kNullOp) return; +#if MXNET_USE_MKLDNN == 1 + //MKLDNN support 2D and 4D concat + if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) { + if(inputs[0].dtype() == mshadow::kFloat32) { + MKLDNNConcat_Forward(attrs, op_ctx, inputs, req, outputs); + } + } + else { + // TODO I need to convert format. + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ConcatCompute(attrs, op_ctx, in_blobs, req, out_blobs); + } +#endif +} + +static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, const std::vector& outputs) { +#if MXNET_USE_MKLDNN == 1 + if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) { + if(inputs[0].dtype() == mshadow::kFloat32) { + MKLDNNConcat_Backward(attrs, ctx, inputs, req, outputs); + } + } + else { + // TODO I need to convert format. 
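+    // The fallback path: .data() converts an MKLDNN-layout array back to
+    // the default layout, which is what ConcatGradCompute expects.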
+ std::vector in_blobs(1); + in_blobs[0] = inputs[0].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ConcatGradCompute(attrs, ctx, in_blobs, req, out_blobs); + } +#endif +} + struct ConcatGrad { const char *op_name; std::vector operator()(const nnvm::NodePtr& n, const std::vector& ograds) const { - const ConcatParam& param = nnvm::get(n->attrs.parsed); + CHECK_EQ(ograds.size(), 1); std::vector heads(ograds.begin(), ograds.end()); + for (size_t i = 0; i < n->inputs.size(); i++) { + heads.push_back(n->inputs[i]); + } return MakeGradNode(op_name, n, heads, n->attrs.dict); } }; @@ -165,7 +258,9 @@ Example:: }) .set_attr("FInferShape", ConcatShape) .set_attr("FInferType", ConcatType) +.set_attr("FInferStorageType", ConcatForwardInferStorageType) .set_attr("FCompute", ConcatCompute) +.set_attr("FComputeEx", ConcatComputeExCPU) .set_attr("FGradient", ConcatGrad{"_backward_Concat"}) .set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") @@ -180,7 +275,9 @@ NNVM_REGISTER_OP(_backward_Concat) }) .set_attr_parser(ParamParser) .set_attr("TIsBackward", true) -.set_attr("FCompute", ConcatGradCompute); +.set_attr("FInferStorageType", backward_ConcatStorageType) +.set_attr("FCompute", ConcatGradCompute) +.set_attr("FComputeEx", ConcatGradComputeExCPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc new file mode 100644 index 000000000000..c3de8a5c4f4f --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_concat.cc
+ * \brief
+ * \author Wenting Jiang
+*/
+#include "../concat-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int concat_dim = param.dim;
+  std::vector<mkldnn::memory::primitive_desc> data_md;
+  std::vector<mkldnn::primitive::at> data_mem;
+  for (int i = 0; i < num_in_data; i++) {
+    std::shared_ptr<const mkldnn::memory> tmp_mem = in_data[i].GetMKLDNNData();
+    auto tmp_pd = tmp_mem->get_primitive_desc();
+    data_md.push_back(tmp_pd);
+    data_mem.push_back(*tmp_mem);
+  }
+  mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md);
+  auto engine = CpuEngine::Instance().get_engine();
+  auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
+      fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]);
+  MKLDNNStream::Instance().RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second));
+  CommitOutput(out_data[concat_enum::kOut], out_mem);
+  MKLDNNStream::Instance().Submit();
+}
+
+void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int axis_ = param.dim;
+  auto engine = CpuEngine::Instance().get_engine();
+  std::shared_ptr<const mkldnn::memory> gz_mem = inputs[0].GetMKLDNNData();
+  mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc();
+  /* init the offset */
+  mkldnn::memory::dims offsets = {0, 0, 0, 0};
+  for (int i = 0; i < num_in_data; i++) {
+    mkldnn::memory::dims diff_src_tz = {inputs[i+1].shape()[0], inputs[i+1].shape()[1], inputs[i+1].shape()[2], inputs[i+1].shape()[3]};
+    auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc();
+    auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]);
+    // create view from gy to gxs[i]
+    std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
+    view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets));
+    // create reorder primitive from gy to gxs[i]
+    mkldnn::reorder::primitive_desc reorder_pd(view_pd.get()->dst_primitive_desc(), diff_src_mpd);
+    offsets[axis_] += diff_src_tz[axis_];
+    MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(reorder_pd, *gz_mem, *gradi_mem_.second));
+    CommitOutput(outputs[i], gradi_mem_);
+  }
+  MKLDNNStream::Instance().Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index ffeaf67fa74a..f8dde505e938 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -76,6 +76,14 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                 const NDArray &in_data, const OpReqType &req,
                 const NDArray &out_data);
 
+/* For concat */
+void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data);
+void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const std::vector<NDArray>& inputs, const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs);
+
 }
 }
 #endif  // MXNET_USE_MKLDNN == 1

From 0e87c49133a4c021df6a3a86b0c6c8610cec70a2 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Thu, 7 Dec 2017 22:16:29 +0000
Subject: [PATCH 140/264] simplify MKLDNN Flatten.
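The explicit ReshapeMKLDNN step is redundant: CopyFrom() (patch 135)
already handles a source whose MKLDNN shape differs from the destination
NDArray, as long as the element counts match, by wrapping the source in a
temporary memory with the destination's dims. The fast path therefore
reduces to a single copy (as in the diff below):

    if (in_stype == kMKLDNNStorage) {
      MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
      return;
    }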
---
 src/operator/tensor/matrix_op.cc | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 5b889bda1f50..d8ab9f9be724 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -130,16 +130,11 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs,
                       const std::vector<NDArray>& outputs) {
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
+#if MXNET_USE_MKLDNN == 1
   const auto in_stype = inputs[0].storage_type();
   const auto out_stype = outputs[0].storage_type();
-#if MXNET_USE_MKLDNN == 1
   if (in_stype == kMKLDNNStorage) {
-    NDArray data = inputs[0];
-    if (data.shape().ndim() != 2) {
-      const TShape& oshape = outputs[0].shape();
-      data = data.ReshapeMKLDNN(mshadow::Shape2(oshape[0], oshape[1]));
-    }
-    MKLDNNCopy(attrs, ctx, data, req[0], outputs[0]);
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
     return;
   }
   // This happens if inputs are supposed to be in MKLDNN format

From 771bbd44fae6041c139e40e798d13302c992959e Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Fri, 8 Dec 2017 00:33:44 +0000
Subject: [PATCH 141/264] Enable MKLDNN deconvolution with bias.

---
 .../nn/mkldnn/mkldnn_deconvolution.cc | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index 7e849fd44d49..377fe760abd3 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -172,13 +172,15 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &
       deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second));
   CommitOutput(out_data[deconv::kOut], out_mem);
   MKLDNNStream::Instance().Submit();
+  // add bias, broadcast bias to dim 1: channel
   if (!param.no_bias) {
-    // add bias, broadcast bias to dim 1: channel
-    // TODO this is problematic if the layout isn't expected.
-    // we need to handle the type correctly.
+    // MKLDNN only supports float right now.
     typedef float DType;
     Stream<cpu> *s = ctx.get_stream<cpu>();
     Tensor<cpu, 1, DType> bias = in_data[deconv::kBias].data().get<cpu, 1, DType>(s);
+    // If the output data is stored in a special MKLDNN format, data()
+    // automatically converts its format to the default format.
+    // Unfortunately, MKLDNN doesn't support broadcast.
     Tensor<cpu, 4, DType> out_cpu = out_data[deconv::kOut].data().get<cpu, 4, DType>(s);
     out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_);
   }
@@ -217,12 +219,17 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext
     MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights(
         bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second));
     CommitOutput(in_grad[deconv::kWeight], in_grad_weight);
-//    if (!param_.no_bias) {
-//      Tensor gbias = in_grad[deconv::kBias].get(s);
-//      Assign(gbias, req[deconv::kBias], sumall_except_dim<1>(grad));
-//    }
   }
   MKLDNNStream::Instance().Submit();
+  if (!param.no_bias) {
+    typedef float DType;
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    Tensor<cpu, 1, DType> gbias = in_grad[deconv::kBias].data().get<cpu, 1, DType>(s);
+    // If there is bias, the out grad has already been converted to the default
+    // format, so this shouldn't cause any performance issues.
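+    // The bias gradient reduces the output gradient over every axis except
+    // the channel axis: gbias[c] = sum over n,h,w of grad[n][c][h][w].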
+ Tensor grad = inputs[deconv::kOut].data().get(s); + Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad)); + } } } From aebac39994ccff545c92b331b36add5d151e8dd0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 01:40:41 +0000 Subject: [PATCH 142/264] Fix a bug in CuDNN deconvolution. --- src/operator/nn/cudnn/cudnn_deconvolution-inl.h | 3 ++- src/operator/nn/deconvolution.cc | 9 +++++++++ src/operator/nn/deconvolution.cu | 14 +++++--------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 7d309e09d589..2172ec0b4fe0 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -216,7 +216,7 @@ class CuDNNDeconvolutionOp { DType *data_ptr = NULL; DType *gdata_ptr = NULL; CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U); CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); if (param_.kernel.ndim() == 2) { @@ -247,6 +247,7 @@ class CuDNNDeconvolutionOp { CHECK_NE(req[deconv::kBias], kWriteInplace); } CHECK_NE(req[deconv::kData], kWriteInplace); + GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 19d5e915fb01..d86e2d3c7720 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -285,7 +285,11 @@ inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); uint32_t out_expected = param.no_bias ? 2 : 3; +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U); +#else CHECK_EQ(in_attrs->size(), 3U); +#endif CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 @@ -374,6 +378,11 @@ struct DeconvolutionGrad { std::vector heads(ograds.begin(), ograds.end()); heads.push_back(n->inputs[deconv::kData]); heads.push_back(n->inputs[deconv::kWeight]); +#if MXNET_USE_CUDNN == 1 + const DeconvolutionParam& param = nnvm::get(n->attrs.parsed); + if (!param.no_bias) + heads.push_back(n->inputs[deconv::kBias]); +#endif return MakeGradNode(op_name, n, heads, n->attrs.dict); } }; diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index e688e49ab20d..9e8840cade85 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -39,13 +39,9 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const Context& ctx, bool backward) { - // Convolution forward has to be called before backward for this operator. - // So we can't make this operator thread local. backward might be called - // in another thread. 
- static CuDNNDeconvolutionOp op; - if (!backward) - op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); + const Context& ctx) { + static thread_local CuDNNDeconvolutionOp op; + op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); return op; } #endif @@ -90,7 +86,7 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, in_shape[i] = inputs[i].shape_; } GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx.ctx, false).Forward(ctx, inputs, req, outputs); + in_shape, out_shape, ctx.run_ctx.ctx).Forward(ctx, inputs, req, outputs); } }) #else @@ -146,7 +142,7 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, in_shape[i] = in_data[i].shape_; } GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx.ctx, true).Backward(ctx, + in_shape, out_shape, ctx.run_ctx.ctx).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) From b9f6f322a04ed9c77f5288643b5b57cdd42f5f72 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 02:24:27 +0000 Subject: [PATCH 143/264] avoid using MKLDNNStorage when it's not defined. --- src/executor/graph_executor.cc | 6 +++++- src/operator/tensor/elemwise_binary_scalar_op_basic.cc | 7 ++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index ca5da2ea8565..a39fddc97a36 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -55,7 +55,11 @@ GraphExecutor::~GraphExecutor() { } inline bool SharableStorage(NDArrayStorageType stype) { - return stype == kDefaultStorage || stype == kMKLDNNStorage; + bool ret = stype == kDefaultStorage; +#if MXNET_USE_MKLDNN == 1 + ret = ret || stype == kMKLDNNStorage; +#endif + return ret; } inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape, diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index 8d2c4102684a..d557e9d6fb5c 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -53,11 +53,12 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a std::vector* in_attrs, std::vector* out_attrs) { bool dispatched = false; - if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage, #if MXNET_USE_MKLDNN == 1 - kMKLDNNStorage, nullptr + if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage, + kMKLDNNStorage, nullptr)) { +#else + if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { #endif - )) { dispatched = storage_type_assign(&out_attrs[0], kDefaultStorage, dispatch_mode, From 9fe92e16405da6a51291200ac83700ae14c01a39 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 02:24:56 +0000 Subject: [PATCH 144/264] Remove ./cudnn_lrn-inl.h --- src/operator/nn/lrn.cc | 3 --- src/operator/nn/lrn.cu | 3 --- 2 files changed, 6 deletions(-) diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 21bf457512f2..a4b6b0e9a797 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -25,9 +25,6 @@ */ #include "./lrn-inl.h" -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_lrn-inl.h" -#endif namespace mxnet { namespace op { diff --git a/src/operator/nn/lrn.cu b/src/operator/nn/lrn.cu index 83dd1d0322ea..4c31ca96025c 100644 --- a/src/operator/nn/lrn.cu +++ b/src/operator/nn/lrn.cu @@ -25,9 +25,6 @@ */ #include "./lrn-inl.h" -#if MXNET_USE_CUDNN == 1 
-#include "./cudnn_lrn-inl.h" -#endif namespace mxnet { namespace op { From 64a4a28d1cbf73fcac8e8d2ddfdc489080441d2d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 06:11:02 +0000 Subject: [PATCH 145/264] Fix for make lint. --- src/common/utils.cc | 4 +- src/ndarray/ndarray.cc | 98 +++++++++---------- src/operator/nn/concat-inl.h | 6 +- src/operator/nn/convolution.cc | 2 - src/operator/nn/deconvolution.cc | 2 - src/operator/nn/fully_connected.cc | 2 - src/operator/nn/lrn-inl.h | 6 +- src/operator/nn/lrn.cc | 2 +- src/operator/nn/mkldnn/mkldnn_act-inl.h | 14 +-- src/operator/nn/mkldnn/mkldnn_base-inl.h | 62 ++++++------ src/operator/nn/mkldnn/mkldnn_convolution.cc | 59 ++++++----- src/operator/nn/mkldnn/mkldnn_copy.cc | 7 +- .../nn/mkldnn/mkldnn_deconvolution.cc | 12 +-- .../nn/mkldnn/mkldnn_fully_connected.cc | 22 ++--- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 15 +-- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 43 +++++--- src/operator/nn/mkldnn/mkldnn_softmax.cc | 4 +- src/operator/nn/mkldnn/mkldnn_sum.cc | 5 +- src/operator/nn/pooling.cc | 20 ++-- .../tensor/elemwise_binary_op_basic.cc | 30 +++--- .../tensor/elemwise_binary_scalar_op_basic.cc | 7 +- .../tensor/elemwise_unary_op_basic.cc | 15 ++- src/operator/tensor/matrix_op.cc | 15 ++- 23 files changed, 225 insertions(+), 227 deletions(-) diff --git a/src/common/utils.cc b/src/common/utils.cc index 8f79fb870879..939b3e8d0a1b 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -50,8 +50,8 @@ std::string stype_string(const int x) { case kRowSparseStorage: return "row_sparse"; #if MXNET_USE_MKLDNN == 1 - case kMKLDNNStorage: - return "mkldnn"; + case kMKLDNNStorage: + return "mkldnn"; #endif } return "unknown"; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f826c54dcedf..67c3f1c3c3ba 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -22,7 +22,6 @@ * \file ndarray.cc * \brief ndarry module of mxnet */ -#include #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include "./ndarray_function.h" #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" @@ -48,10 +48,11 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { -static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, int dtype, const TShape &shape) { +static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, + int dtype, const TShape &shape) { #if MXNET_USE_MKLDNN == 1 - // We can't always generate a MKLDNN storage. If MKLDNN can't support the data type, - // we'll have to fall back to the default storage. + // We can't always generate a MKLDNN storage. If MKLDNN can't support + // the data type, we'll have to fall back to the default storage. 
if (stype == kMKLDNNStorage && !SupportMKLDNNArray(dtype, shape)) return kDefaultStorage; else @@ -158,15 +159,14 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { #if MXNET_USE_MKLDNN == 1 static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { - if (desc.data.ndims == 1) + if (desc.data.ndims == 1) { return desc.data.format; - else if (desc.data.ndims == 2) { + } else if (desc.data.ndims == 2) { if (desc.data.format == mkldnn_io) return mkldnn_oi; else return desc.data.format; - } - else if (desc.data.ndims == 4) { + } else if (desc.data.ndims == 4) { switch (desc.data.format) { case mkldnn_nchw: case mkldnn_nhwc: @@ -194,8 +194,7 @@ static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; return mkldnn_format_undef; } - } - else if (desc.data.ndims == 5) { + } else if (desc.data.ndims == 5) { switch (desc.data.format) { case mkldnn_goihw: case mkldnn_gOIhw8i8o: @@ -215,8 +214,7 @@ static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; return mkldnn_format_undef; } - } - else { + } else { LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; return mkldnn_format_undef; } @@ -287,9 +285,9 @@ NDArray NDArray::Reshape(const TShape &shape) const { auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); - } - else + } else { ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_; + } } }, ctx(), {this->var()}, {ret.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); @@ -340,8 +338,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); - } - else { + } else { ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_; } }, ctx(), {this->var()}, {ret.var()}, @@ -376,11 +373,13 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) { } NDArray NDArray::At(index_t idx) const { - CHECK(storage_type() == kDefaultStorage #if MXNET_USE_MKLDNN == 1 - || storage_type() == kMKLDNNStorage + CHECK(storage_type() == kDefaultStorage + || storage_type() == kMKLDNNStorage) +#else + CHECK(storage_type() == kDefaultStorage) #endif - ) << "Storage type " << storage_type() << " doesn't support At()"; + << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->Slice(idx, idx+1); if (shape_.ndim() > 1) { return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); @@ -390,11 +389,13 @@ NDArray NDArray::At(index_t idx) const { } NDArray NDArray::AtWithRecord(index_t idx) { - CHECK(storage_type() == kDefaultStorage #if MXNET_USE_MKLDNN == 1 - || storage_type() == kMKLDNNStorage + CHECK(storage_type() == kDefaultStorage + || storage_type() == kMKLDNNStorage) +#else + CHECK(storage_type() == kDefaultStorage) #endif - ) << "Storage type " << storage_type() << " doesn't support At()"; + << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->SliceWithRecord(idx, idx+1); if (shape_.ndim() > 1) { return ret.ReshapeWithRecord(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); @@ -450,7 +451,7 @@ void 
NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data // in 5 dimensions while the NDArray stores data in 4 dimensions. - // TODO is it possible that the MKL memory is out-of-date? + // TODO(zhengda) is it possible that the MKL memory is out-of-date? if (Mkl_mem_ && storage_type == kMKLDNNStorage) { return; } @@ -462,22 +463,21 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { dims.resize(shape.ndim()); for (size_t i = 0; i < dims.size(); i++) dims[i] = shape[i]; - } - // If there are 3 dimensions, we'll force it to 4 dimensions. - else if (shape.ndim() == 3) { + } else if (shape.ndim() == 3) { + // If there are 3 dimensions, we'll force it to 4 dimensions. dims.resize(shape.ndim() + 1); dims[0] = 1; for (size_t i = 0; i < shape.ndim(); i++) dims[i + 1] = shape[i]; - } - else + } else { LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions"; + } mkldnn::memory::format layout = mkldnn::memory::format::format_undef; switch (dims.size()) { case 1: layout = mkldnn::memory::format::x; break; case 2: layout = mkldnn::memory::format::nc; break; case 4: layout = mkldnn::memory::format::nchw; break; - // TODO This isn't the right layout when the data has 5 dimensions in MXNet. + // This isn't the right layout when the data has 5 dimensions in MXNet. // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have // a corresponding format. case 5: layout = mkldnn::memory::format::goihw; break; @@ -491,9 +491,8 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { CheckAndAlloc(); Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, cpu_engine), shandle.dptr)); - } - // If the array uses MKLDNN storage, we need to allocate memory here. - else if (storage_type == kMKLDNNStorage) { + } else if (storage_type == kMKLDNNStorage) { + // If the array uses MKLDNN storage, we need to allocate memory here. Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, cpu_engine))); } @@ -528,9 +527,9 @@ std::shared_ptr NDArray::GetMKLDNNData( mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); MKLDNNStream::Instance().RegisterMem(ret); return ret; - } - else + } else { return nullptr; + } } std::shared_ptr NDArray::GetMKLDNNDataReorder( @@ -557,17 +556,15 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( mkldnn::memory::primitive_desc _desc = desc; // Now we need to determine if we should reorder the memory. // If both use the default formats, we think we don't need to reshape. - // TODO if the memory format isn't the default one, it may not work. auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc(); auto desc2 = _desc.desc(); - if (desc1.data.format == GetDefaultFormat(desc1) && + if (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2)) { mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); stream.RegisterMem(ret); return ret; - } - else { - // TODO we should manage the memory allocation here. + } else { + // TODO(zhengda) we should manage the memory allocation here. 
mkldnn_mem_ptr ret(new mkldnn::memory(desc)); stream.RegisterMem(ret); stream.RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); @@ -576,14 +573,15 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( } std::shared_ptr NDArray::GetMKLDNNData() const { + CHECK(storage_type() == kMKLDNNStorage || storage_type() == kDefaultStorage); ptr_->SetMKLMem(shape_, dtype_); if (ptr_->Mkl_mem_) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; - } - else - // TODO We don't support converting sparse format. + } else { + // We don't support converting sparse format. return nullptr; + } } void NDArray::CopyFrom(const mkldnn::memory &mem) { @@ -607,18 +605,20 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { // In this case, we can simply create a new MKLDNN memory for the required // shape. - // TODO let's just hope it's the default format for now. + // TODO(zhengda) let's just hope it's the default format for now. CHECK_EQ(GetDefaultFormat(from_desc), from_desc.data.format); - mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims); - mkldnn::memory::desc data_md(dims, static_cast(this_desc.data.data_type), - static_cast(GetDefaultFormat(this_desc))); + mkldnn::memory::dims dims(this_desc.data.dims, + this_desc.data.dims + this_desc.data.ndims); + auto this_dtype = static_cast(this_desc.data.data_type); + auto this_format = static_cast(GetDefaultFormat(this_desc)); + mkldnn::memory::desc data_md(dims, this_dtype, this_format); mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); stream.RegisterMem(tmp_mem); stream.RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); - } - else + } else { stream.RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); + } } std::shared_ptr NDArray::CreateMKLDNNData( @@ -668,7 +668,7 @@ void NDArray::SetTBlob() const { ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_); else ptr_->SetMKLMem(shape_, dtype_); - dptr = (char *) ptr_->Mkl_mem_->get_data_handle(); + dptr = static_cast(ptr_->Mkl_mem_->get_data_handle()); #endif } else { LOG(FATAL) << "unknown storage type " << stype; diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h index 411ad23eea8b..dc6a6c04fb52 100644 --- a/src/operator/nn/concat-inl.h +++ b/src/operator/nn/concat-inl.h @@ -23,8 +23,8 @@ * \brief * \author Bing Xu */ -#ifndef MXNET_OPERATOR_CONCAT_INL_H_ -#define MXNET_OPERATOR_CONCAT_INL_H_ +#ifndef MXNET_OPERATOR_NN_CONCAT_INL_H_ +#define MXNET_OPERATOR_NN_CONCAT_INL_H_ #include #include #include @@ -156,4 +156,4 @@ void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONCAT_INL_H_ +#endif // MXNET_OPERATOR_NN_CONCAT_INL_H_ diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index e748ad0ea32a..8513e23d5036 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -57,7 +57,6 @@ static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, return; } #endif - // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); @@ -76,7 +75,6 @@ static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, return; } #endif - // TODO I need to convert format. 
std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index d86e2d3c7720..25d971bd5994 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -315,7 +315,6 @@ static void DeconvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, return; } #endif - // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); @@ -334,7 +333,6 @@ static void DeconvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, return; } #endif - // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index b2281696fc93..dbaae27ad764 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -82,7 +82,6 @@ void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ct return; } #endif - // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); @@ -101,7 +100,6 @@ void FullyConnectedGradCompute_CPU(const nnvm::NodeAttrs& attrs, return; } #endif - // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h index 2dfecea0bde1..fdae1eca0aef 100644 --- a/src/operator/nn/lrn-inl.h +++ b/src/operator/nn/lrn-inl.h @@ -23,8 +23,8 @@ * \brief * \author Bing Xu */ -#ifndef MXNET_OPERATOR_LRN_INL_H_ -#define MXNET_OPERATOR_LRN_INL_H_ +#ifndef MXNET_OPERATOR_NN_LRN_INL_H_ +#define MXNET_OPERATOR_NN_LRN_INL_H_ #include #include #include @@ -124,4 +124,4 @@ void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_LRN_INL_H_ +#endif // MXNET_OPERATOR_NN_LRN_INL_H_ diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index a4b6b0e9a797..53769c1c4c7d 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -70,7 +70,7 @@ struct LRNGrad { std::vector operator()(const nnvm::NodePtr& n, const std::vector& ograds) const { std::vector heads; - heads.push_back(ograds[0]); // out_grad + heads.push_back(ograds[0]); // out_grad heads.push_back(n->inputs[lrn_enum::kData]); heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0}); return MakeGradNode(op_name, n, heads, n->attrs.dict); diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index eebd65390836..be625b87e3d2 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -23,8 +23,8 @@ * \author Da Zheng */ -#ifndef MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ -#define MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ #include @@ -80,10 +80,10 @@ void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, auto alg = GetMKLDNNActAlgo(param); mkldnn::eltwise_forward::desc desc = ctx.is_train - ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, - alg, data_md, alpha) - : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, - alg, data_md, alpha); + ? 
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, + alg, data_md, alpha) + : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, + alg, data_md, alpha); mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine); std::shared_ptr output_memory @@ -128,4 +128,4 @@ void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, } // namespace mxnet #endif -#endif // MXNET_OPERATOR_MKL_MKLDNN_ACT_INL_H_ +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 5c04071a7783..53ded72ac642 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -23,14 +23,15 @@ * *******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ -#define MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ #if MXNET_USE_MKLDNN == 1 #include #include #include #include +#include #include "mkldnn.hpp" using namespace mkldnn; namespace mxnet { @@ -142,9 +143,11 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, return GetMemDesc(arr); } else { CHECK_EQ(arr.shape().ndim(), 4U); - mkldnn::memory::dims tz = mkldnn::memory::dims{ - num_groups, (int)arr.shape()[0] / num_groups, (int)arr.shape()[1], - (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast(arr.shape()[0] / num_groups), + static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), + static_cast(arr.shape()[3])}; return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()), mkldnn::memory::format::any}; } @@ -178,7 +181,7 @@ class MKLDNNStream { inline static mkldnn_mem_ptr CreateMKLDNNMem( const mkldnn::memory::primitive_desc &desc) { - // TODO allocate memory more efficiently. + // TODO(zhengda) allocate memory more efficiently. std::shared_ptr ret(new mkldnn::memory(desc)); MKLDNNStream::Instance().RegisterMem(ret); return ret; @@ -195,9 +198,9 @@ typedef std::pair mkldnn_output_t; static inline mkldnn_output_t CreateMKLDNNMem( const NDArray &arr, const mkldnn::memory::primitive_desc &desc, OpReqType req) { - if (kAddTo == req) + if (kAddTo == req) { return mkldnn_output_t(OutDataOp::AddBack, CreateMKLDNNMem(desc)); - else { + } else { mkldnn_mem_ptr mem = const_cast(arr).CreateMKLDNNData(desc); if (mem == nullptr) return mkldnn_output_t(OutDataOp::CopyBack, CreateMKLDNNMem(desc)); @@ -213,10 +216,9 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, static inline void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { - if (res.first == CopyBack) + if (res.first == CopyBack) { const_cast(arr).CopyFrom(*res.second); - else if (res.first == AddBack) { - // TODO I might need to reorder. 
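// AddBack is the kAddTo case: the result was computed into a temporary, so
// the existing output is fetched in the temporary's layout, the two are
// summed, and the sum is written back to the output array.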
+ } else if (res.first == AddBack) { mkldnn_mem_const_ptr mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); CHECK(mem != nullptr); @@ -236,26 +238,28 @@ inline static mkldnn_mem_const_ptr GetWeights( mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Instance().get_engine(); if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = - mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = - mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1], - (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; mem = arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - num_groups, (int)arr.shape()[0] / num_groups, (int)arr.shape()[1], - (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast(arr.shape()[0] / num_groups), + static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), + static_cast(arr.shape()[3])}; mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; mkldnn::memory::primitive_desc pd = @@ -277,26 +281,28 @@ inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, int num_groups = 1) { mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = - mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = - mkldnn::memory::dims{(int)arr.shape()[0], (int)arr.shape()[1], - (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; mkldnn::memory::primitive_desc pd = mkldnn::memory::primitive_desc{md, engine}; return arr.GetMKLDNNData(pd); } else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - num_groups, (int)arr.shape()[0] / num_groups, (int)arr.shape()[1], - (int)arr.shape()[2], (int)arr.shape()[3]}; + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast(arr.shape()[0] / num_groups), + static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), + static_cast(arr.shape()[3])}; mkldnn::memory::desc md = mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; mkldnn::memory::primitive_desc pd = @@ -310,4 
+316,4 @@ inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, } // namespace mxnet #endif -#endif // MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_ +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index e152a29fc92f..a6e756fe9499 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -53,15 +53,13 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwd( mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); - } - else if (param.dilate.ndim() == 0) { + } else if (param.dilate.ndim() == 0) { auto bias_md = GetMemDesc(*bias); mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, weight_md, bias_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); - } - else { + } else { mkldnn::memory::dims dilates{0, 0}; if (param.dilate.ndim() == 2) { dilates[0] = param.dilate[0] - 1; @@ -72,12 +70,12 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwd( data_md, weight_md, out_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); - } - else { + } else { auto bias_md = GetMemDesc(*bias); mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, - mkldnn::padding_kind::zero); + data_md, weight_md, bias_md, out_md, strides, + dilates, padding, padding, + mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); } } @@ -104,8 +102,7 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); - } - else { + } else { mkldnn::memory::dims dilates{0, 0}; if (param.dilate.ndim() == 2) { dilates[0] = param.dilate[0] - 1; @@ -140,15 +137,13 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } - else if (param.dilate.ndim() == 0) { + } else if (param.dilate.ndim() == 0) { auto bias_md = GetMemDesc(*bias); mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md, weight_md, bias_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } - else { + } else { mkldnn::memory::dims dilates{0, 0}; if (param.dilate.ndim() == 2) { dilates[0] = param.dilate[0] - 1; @@ -159,20 +154,21 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( data_md, weight_md, out_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } - else { + } else 
{ auto bias_md = GetMemDesc(*bias); mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, - mkldnn::padding_kind::zero); + data_md, weight_md, bias_md, out_md, + strides, dilates, padding, padding, + mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); } } } void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { const ConvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], @@ -223,24 +219,27 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c } if (req[conv::kWeight]) { mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd - = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], - param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut], fwd_pd); + = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? nullptr : &inputs[conv::kBias + 1], + inputs[conv::kOut], fwd_pd); auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( bwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[conv::kWeight], - bwdWeights_pd.diff_weights_primitive_desc(), req[conv::kWeight]); + bwdWeights_pd.diff_weights_primitive_desc(), + req[conv::kWeight]); mkldnn_output_t in_grad_bias; if (param.no_bias) { MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); } else { in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias], - bwdWeights_pd.diff_bias_primitive_desc(), req[conv::kBias]); + bwdWeights_pd.diff_bias_primitive_desc(), + req[conv::kBias]); MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, - *in_grad_bias.second)); + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, + *in_grad_bias.second)); } CommitOutput(in_grad[conv::kWeight], in_grad_weight); CommitOutput(in_grad[conv::kBias], in_grad_bias); @@ -248,7 +247,7 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c MKLDNNStream::Instance().Submit(); } -} -} +} // namespace op +} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc index 6f1975dd279b..aa141f1e3e09 100644 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ b/src/operator/nn/mkldnn/mkldnn_copy.cc @@ -46,13 +46,12 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, MKLDNNStream::Instance().RegisterMem(sum_res); Sum(*in_mem, *out_mem, *sum_res); const_cast(out_data).CopyFrom(*sum_res); - } - else { + } else { const_cast(out_data).CopyFrom(*in_mem); } MKLDNNStream::Instance().Submit(); } -} -} +} // namespace op +} // namespace mxnet #endif diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc 
b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 377fe760abd3..cca73e3d9445 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -50,8 +50,7 @@ static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_( mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_forward::primitive_desc(desc, engine); - } - else { + } else { auto bias_md = GetBiasDesc(data_md); mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md, @@ -143,8 +142,7 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero); return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } - else { + } else { auto bias_md = GetBiasDesc(data_md); mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding, @@ -232,7 +230,7 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext } } -} -} +} // namespace op +} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 17f504b9062e..f3ebb055d1b4 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -41,8 +41,7 @@ inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, data_md, weight_md, bias_md, out_md); return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); - } - else { + } else { mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, data_md, weight_md, out_md); return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); @@ -73,8 +72,7 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei weight_md, bias_md, out_md); return mkldnn::inner_product_backward_weights::primitive_desc( ipBwdWeights_desc, engine, ipFwd_pd); - } - else { + } else { mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, weight_md, out_md); return mkldnn::inner_product_backward_weights::primitive_desc( @@ -94,16 +92,14 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, if (data.shape().ndim() != 2 && !param.flatten) { data = data.ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); - // TODO this can potentially be a problem when casting the type. - mkldnn::memory::dims out_dims{(int) oshape.ProdShape(0, oshape.ndim()-1), - (int) oshape[ishape.ndim()-1]}; + mkldnn::memory::dims out_dims{static_cast(oshape.ProdShape(0, oshape.ndim()-1)), + static_cast(oshape[ishape.ndim()-1])}; out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), mkldnn::memory::format::any); - } - else if (data.shape().ndim() != 2) { + } else if (data.shape().ndim() != 2) { data = data.ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); - // TODO this can potentially be a problem when casting the type. 
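// Inner product only accepts 2-D data, so N-D inputs are flattened first:
// the branch above collapses everything but the last axis into the batch
// dimension (flatten=false), while this branch collapses everything after
// the first axis; the output descriptor is built from the matching 2-D shape.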
- mkldnn::memory::dims out_dims{(int) oshape[0], (int) oshape.ProdShape(1, oshape.ndim())}; + mkldnn::memory::dims out_dims{static_cast(oshape[0]), + static_cast(oshape.ProdShape(1, oshape.ndim()))}; out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), mkldnn::memory::format::any); } @@ -192,6 +188,6 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, MKLDNNStream::Instance().Submit(); } -} -} +} // namespace op +} // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index f8dde505e938..92ec12cf5e36 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -23,7 +23,6 @@ * \author Da Zheng */ -#include #include #include #include @@ -31,6 +30,8 @@ #include #include #include +#include +#include #ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ #define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ @@ -76,16 +77,8 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &in_data, const OpReqType &req, const NDArray &out_data); -/* For concat */ -void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data); -void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs); - -} -} +} // namespace op +} // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 #endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index 309cd510a4a1..06def9c28dc4 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -21,8 +21,11 @@ * \file mkldnn_pooling.cc * \brief */ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ #if MXNET_USE_MKLDNN == 1 + #include #include "../pooling-inl.h" #include "./mkldnn_base-inl.h" @@ -100,10 +103,15 @@ inline static pooling_forward::primitive_desc GetPoolingFwd( if (is_train && alg != algorithm::pooling_avg) { kind = prop_kind::forward_training; } - pooling_forward::desc poolingFwd_desc( - kind, alg, data_md, out_md, {(int)stride_h_, (int)stride_w_}, - {kernel_h_, kernel_w_}, {(int)pad_t_, (int)pad_l_}, {(int)pad_b_, (int)pad_r_}, - padding_kind::zero); + pooling_forward::desc poolingFwd_desc(kind, alg, data_md, out_md, + {static_cast(stride_h_), + static_cast(stride_w_)}, + {kernel_h_, kernel_w_}, + {static_cast(pad_t_), + static_cast(pad_l_)}, + {static_cast(pad_b_), + static_cast(pad_r_)}, + padding_kind::zero); return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine); } @@ -119,7 +127,8 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, auto data_md = data_mpd.desc(); memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1], - (int)out_data.shape()[2], (int)out_data.shape()[3]}; + static_cast(out_data.shape()[2]), + static_cast(out_data.shape()[3])}; memory::desc out_md({dims}, static_cast(data_md.data.data_type), static_cast(data_md.data.format)); @@ -156,7 +165,8 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); memory::dims dims = {data_md.data.dims[0], 
data_md.data.dims[1], - (int)out_grad.shape()[2], (int)out_grad.shape()[3]}; + static_cast(out_grad.shape()[2]), + static_cast(out_grad.shape()[3])}; memory::desc out_md({dims}, static_cast(data_md.data.data_type), static_cast(data_md.data.format)); @@ -164,7 +174,8 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc(); memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1], - (int)in_grad.shape()[2], (int)in_grad.shape()[3]}; + static_cast(in_grad.shape()[2]), + static_cast(in_grad.shape()[3])}; memory::desc diff_in_md( {dims1}, static_cast(diff_md.data.data_type), static_cast(diff_md.data.format)); @@ -180,10 +191,15 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, kernel_h_ = param.kernel[0]; kernel_w_ = param.kernel[1]; } - pooling_backward::desc desc( - alg, diff_in_md, diff_md, {(int)param.stride[0], (int)param.stride[1]}, - {kernel_h_, kernel_w_}, {(int)param.pad[0], (int)param.pad[1]}, - {(int)param.pad[0], (int)param.pad[1]}, padding_kind::zero); + pooling_backward::desc desc(alg, diff_in_md, diff_md, + {static_cast(param.stride[0]), + static_cast(param.stride[1])}, + {kernel_h_, kernel_w_}, + {static_cast(param.pad[0]), + static_cast(param.pad[1])}, + {static_cast(param.pad[0]), + static_cast(param.pad[1])}, + padding_kind::zero); pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd); auto diff_src_mem = @@ -203,6 +219,7 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, CommitOutput(in_grad, diff_src_mem); MKLDNNStream::Instance().Submit(); } -} -} +} // namespace op +} // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc index 1cf965915489..f5eff39986d7 100644 --- a/src/operator/nn/mkldnn/mkldnn_softmax.cc +++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc @@ -50,6 +50,6 @@ void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, stream.Submit(); } -} -} +} // namespace op +} // namespace mxnet #endif diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc index 5645b276656f..9f5c5a319c60 100644 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -43,6 +43,7 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, scales[1] = 1; inputs.push_back(arr1); inputs.push_back(arr2); + // TODO(zhengda) I need to reorder memory here. 
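// (mkldnn::sum assumes its inputs match the primitive descriptors collected
// in input_pds; if the two arrays ever arrived in different layouts, one of
// them would first need a reorder -- hence the TODO above.)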
mkldnn::sum::primitive_desc sum_pd(scales, input_pds); MKLDNNStream::Instance().RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); } @@ -68,6 +69,6 @@ void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, stream.Submit(); } -} -} +} // namespace op +} // namespace mxnet #endif diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index c17a879df453..ed20a7cf347f 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -114,7 +114,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_)) - out_shape->push_back(oshape); // for workspace + out_shape->push_back(oshape); // for workspace #endif } else if (param_.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4U) @@ -153,7 +153,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_)) - out_shape->push_back(oshape); // for workspace + out_shape->push_back(oshape); // for workspace #endif } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) @@ -199,7 +199,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_)) - out_shape->push_back(oshape); // for workspace + out_shape->push_back(oshape); // for workspace #endif } return true; @@ -223,7 +223,6 @@ void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, return; } #endif - // TODO I need to convert format. std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); // We know pooling has only one output. @@ -249,8 +248,7 @@ void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, CHECK_EQ(inputs.size(), 5U); in_data = &inputs[2]; workspace = &inputs[4]; - } - else { + } else { CHECK_EQ(inputs.size(), 3U); in_data = &inputs[1]; } @@ -262,19 +260,17 @@ void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, return; } #endif - // TODO I need to convert format. std::vector in_blobs(3); // In this case, there isn't workspace in the input arrays. if (inputs.size() == 3) { for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); - } - else { + } else { // There is workspace among the input arrays. One for out_grad and one for // input. - in_blobs[0] = inputs[0].data(); // out grad - in_blobs[1] = inputs[2].data(); // in data - in_blobs[2] = inputs[3].data(); // out data + in_blobs[0] = inputs[0].data(); // out grad + in_blobs[1] = inputs[2].data(); // in data + in_blobs[2] = inputs[3].data(); // out data } std::vector out_blobs(outputs.size()); for (size_t i = 0; i < out_blobs.size(); i++) diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 4d51d5b0e0b6..4a3985d9cdbb 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -41,12 +41,11 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, || inputs[1].storage_type() == kMKLDNNStorage) { MKLDNNSum_Forward(attrs, ctx, inputs, req[0], outputs[0]); return; - } - // This happens if inputs are supposed to be in MKLDNN format - // but MKLDNN doesn't support the data type or the shape. 
We're - // forced to convert it to the default format. - else if (inputs[0].storage_type() == kDefaultStorage - || inputs[1].storage_type() == kDefaultStorage) { + } else if (inputs[0].storage_type() == kDefaultStorage + || inputs[1].storage_type() == kDefaultStorage) { + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. std::vector in_blobs(2); std::vector out_blobs(1); in_blobs[0] = inputs[0].data(); @@ -74,10 +73,10 @@ static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, out_attrs->at(0) = kMKLDNNStorage; *dispatch_mode = DispatchMode::kFComputeEx; return true; - } else + } #endif - return ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); } MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) @@ -115,10 +114,11 @@ static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, if (inputs[0].storage_type() == kMKLDNNStorage) { MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]); - } else + return; + } #endif - ElemwiseBinaryOp::BackwardUseNoneEx( - attrs, ctx, inputs, req, outputs); + ElemwiseBinaryOp::BackwardUseNoneEx( + attrs, ctx, inputs, req, outputs); } static inline bool _backward_ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, @@ -134,10 +134,10 @@ static inline bool _backward_ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs out_attrs->at(1) = kMKLDNNStorage; *dispatch_mode = DispatchMode::kFComputeEx; return true; - } else + } #endif - return ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); } NNVM_REGISTER_OP(_backward_add) diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index d557e9d6fb5c..82e497af67dc 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -86,11 +86,12 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs, const auto in_stype = in_attrs->at(0); auto &out_stype = out_attrs->at(0); bool dispatched = false; - if (!dispatched && (in_stype == kDefaultStorage #if MXNET_USE_MKLDNN == 1 - || in_stype == kMKLDNNStorage + if (!dispatched && (in_stype == kDefaultStorage + || in_stype == kMKLDNNStorage)) { +#else + if (!dispatched && (in_stype == kDefaultStorage)) { #endif - )) { // dns -> dns dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 203673a4b247..24505d438c5e 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -122,11 +122,10 @@ static void CopyEx(const nnvm::NodeAttrs& attrs, if (in_stype == kMKLDNNStorage) { MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); return; - } - // This happens if inputs are supposed to be in MKLDNN format - // but MKLDNN doesn't support the data type or the shape. We're - // forced to convert it to the default format. 
- else if (inputs[0].storage_type() == kDefaultStorage) { + } else if (inputs[0].storage_type() == kDefaultStorage) { + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. std::vector in_blobs(1); std::vector out_blobs(1); in_blobs[0] = inputs[0].data(); @@ -150,10 +149,10 @@ static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs, out_attrs->at(0) = kMKLDNNStorage; *dispatch_mode = DispatchMode::kFComputeEx; return true; - } else + } #endif - return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); } MXNET_OPERATOR_REGISTER_UNARY(_copy) diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index d8ab9f9be724..a7318ca78d1b 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -136,11 +136,10 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs, if (in_stype == kMKLDNNStorage) { MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); return; - } - // This happens if inputs are supposed to be in MKLDNN format - // but MKLDNN doesn't support the data type or the shape. We're - // forced to convert it to the default format. - else if (in_stype == kDefaultStorage) { + } else if (in_stype == kDefaultStorage) { + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. std::vector in_blobs(1); std::vector out_blobs(1); in_blobs[0] = inputs[0].data(); @@ -163,10 +162,10 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, out_attrs->at(0) = kMKLDNNStorage; *dispatch_mode = DispatchMode::kFComputeEx; return true; - } else + } #endif - return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); } NNVM_REGISTER_OP(Flatten) From 6a973b9a6bc3e300de5bef0e617b448f1cfdb094 Mon Sep 17 00:00:00 2001 From: wentingj Date: Thu, 7 Dec 2017 16:52:00 +0800 Subject: [PATCH 146/264] add mkldnn surport for concat --- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 92ec12cf5e36..99da180e3cc5 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -77,8 +77,16 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &in_data, const OpReqType &req, const NDArray &out_data); -} // namespace op -} // namespace mxnet +/* For concat */ +void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data); +void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs); + +} +} #endif // MXNET_USE_MKLDNN == 1 #endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ From 337b2ecf17572ca3abbf3a863e0fc352a9478abf Mon Sep 17 00:00:00 2001 From: wentingj Date: Fri, 8 Dec 2017 17:39:00 +0800 Subject: [PATCH 147/264] fix the coding style for pr of mkldnn concat --- 
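The mkldnn_concat.cc hunks below implement concat with MKLDNN primitives: the forward pass collects one memory descriptor per input and hands them to mkldnn::concat, and the backward pass splits the output gradient back into per-input gradients by carving a view at a running offset and reordering it out. A plain-C++ sketch of that offset bookkeeping (SplitGrad and its names are illustrative, not part of MXNet):

#include <cstddef>
#include <vector>

// Split a gradient produced by concatenating several 1-D inputs: each input
// reclaims widths[i] elements starting at a running offset, mirroring how
// offsets[axis_] advances in MKLDNNConcat_Backward.
std::vector<std::vector<float>> SplitGrad(const std::vector<float> &gz,
                                          const std::vector<std::size_t> &widths) {
  std::vector<std::vector<float>> grads;
  std::size_t offset = 0;
  for (std::size_t w : widths) {
    grads.emplace_back(gz.begin() + offset, gz.begin() + offset + w);
    offset += w;  // the next input's "view" starts where this one ended
  }
  return grads;
}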
src/operator/nn/concat.cc | 14 +++++--------- src/operator/nn/mkldnn/mkldnn_concat.cc | 19 +++++++++++-------- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 8 ++++---- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index d17bf8054238..bbd41cb0153d 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -153,14 +153,12 @@ void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(req.size(), 1U); if (req[0] == kNullOp) return; #if MXNET_USE_MKLDNN == 1 - //MKLDNN support 2D and 4D concat + // MKLDNN support 2D and 4D concat if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) { - if(inputs[0].dtype() == mshadow::kFloat32) { + if (inputs[0].dtype() == mshadow::kFloat32) { MKLDNNConcat_Forward(attrs, op_ctx, inputs, req, outputs); } - } - else { - // TODO I need to convert format. + } else { std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); @@ -177,12 +175,10 @@ static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) { - if(inputs[0].dtype() == mshadow::kFloat32) { + if (inputs[0].dtype() == mshadow::kFloat32) { MKLDNNConcat_Backward(attrs, ctx, inputs, req, outputs); } - } - else { - // TODO I need to convert format. + } else { std::vector in_blobs(1); in_blobs[0] = inputs[0].data(); std::vector out_blobs(outputs.size()); diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc index c3de8a5c4f4f..8171784d56cf 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -38,13 +38,13 @@ void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, int concat_dim = param.dim; std::vector data_md; std::vector data_mem; - for(int i =0; i < num_in_data; i++) { + for (int i =0; i < num_in_data; i++) { std::shared_ptr tmp_mem = in_data[i].GetMKLDNNData(); auto tmp_pd = tmp_mem->get_primitive_desc(); data_md.push_back(tmp_pd); data_mem.push_back(*tmp_mem); } - mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md); + mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md); auto engine = CpuEngine::Instance().get_engine(); auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut], fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]); @@ -61,25 +61,28 @@ void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, int axis_ = param.dim; auto engine = CpuEngine::Instance().get_engine(); std::shared_ptrgz_mem = inputs[0].GetMKLDNNData(); - mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); + mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); /* init the offset */ mkldnn::memory::dims offsets = {0, 0, 0, 0}; for (int i = 0; i < num_in_data; i++) { - mkldnn::memory::dims diff_src_tz = {inputs[i+1].shape()[0], inputs[i+1].shape()[1], inputs[i+1].shape()[2], inputs[i+1].shape()[3]}; + mkldnn::memory::dims diff_src_tz = {inputs[i+1].shape()[0], inputs[i+1].shape()[1], + inputs[i+1].shape()[2], inputs[i+1].shape()[3]}; auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc(); auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]); // create view from gy to gxs[i] std::shared_ptr view_pd; view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets)); // create reorder primitive from gy to gxs[i] 
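// The view carves out the slice of the concatenated gradient that starts at
// the current `offsets`; the reorder then copies that slice into the i-th
// input gradient, and offsets[axis_] advances below by the slice's extent.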
- mkldnn::reorder::primitive_desc reorder_pd(view_pd.get()->dst_primitive_desc(), diff_src_mpd); + mkldnn::reorder::primitive_desc reorder_pd( + view_pd.get()->dst_primitive_desc(), diff_src_mpd); offsets[axis_] += diff_src_tz[axis_]; - MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(reorder_pd, *gz_mem, *gradi_mem_.second)); + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder( + reorder_pd, *gz_mem, *gradi_mem_.second)); CommitOutput(outputs[i], gradi_mem_); } MKLDNNStream::Instance().Submit(); } -}//op -}//mxnet +} // namespace op +} // namespace mxnet #endif diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 99da180e3cc5..4d2543dc6f25 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -74,8 +74,8 @@ void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, /* For copy */ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data); + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); /* For concat */ void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, @@ -85,8 +85,8 @@ void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs); -} -} +} // namespace op +} // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 #endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ From c290f6eb2a448daeb430737c4197c52714ddeb23 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 09:13:19 -0800 Subject: [PATCH 148/264] Only add input data for MKLDNN concat backward --- src/operator/nn/concat.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index bbd41cb0153d..f660b335af0b 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -195,9 +195,11 @@ struct ConcatGrad { const std::vector& ograds) const { CHECK_EQ(ograds.size(), 1); std::vector heads(ograds.begin(), ograds.end()); +#if MXNET_USE_MKLDNN == 1 for (size_t i = 0; i < n->inputs.size(); i++) { heads.push_back(n->inputs[i]); } +#endif return MakeGradNode(op_name, n, heads, n->attrs.dict); } }; From a2d8906e1350410ef42488269edfaf561177ac45 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 18:51:30 +0000 Subject: [PATCH 149/264] Remove unnecessary TODO. --- python/mxnet/ndarray/mkldnn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/mxnet/ndarray/mkldnn.py b/python/mxnet/ndarray/mkldnn.py index e90fd77a34db..42386a558396 100644 --- a/python/mxnet/ndarray/mkldnn.py +++ b/python/mxnet/ndarray/mkldnn.py @@ -59,14 +59,12 @@ def __repr__(self): return '\n<%s %s @%s>' % (self.__class__.__name__, shape_info, self.context) - # TODO def _at(self, idx): raise NotSupportedForMKLNDArray(self._at, '[idx]', idx) def _slice(self, start, stop): return op.slice(self, begin=start, end=stop) - # TODO def astype(self, dtype): """Returns a copy of the array after casting to a specified type. Parameters @@ -85,7 +83,6 @@ def astype(self, dtype): self.copyto(res) return res - # TODO def copyto(self, other): """Copies the value of this array to another array. From 57c2b3d1d16447d0f920b4a3151a6f537ee912dd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 19:08:01 +0000 Subject: [PATCH 150/264] remove unnecessary __repr__ in MKLNDArray. 
--- python/mxnet/ndarray/mkldnn.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/mxnet/ndarray/mkldnn.py b/python/mxnet/ndarray/mkldnn.py index 42386a558396..d1b065abfb76 100644 --- a/python/mxnet/ndarray/mkldnn.py +++ b/python/mxnet/ndarray/mkldnn.py @@ -52,13 +52,6 @@ class MKLNDArray(NDArray): """The base class of an NDArray stored in a MKLDNN storage format. """ - def __repr__(self): - """Returns a string representation of the sparse array.""" - shape_info = 'x'.join(['%d' % x for x in self.shape]) - # The data content is not displayed since the array usually has big shape - return '\n<%s %s @%s>' % (self.__class__.__name__, - shape_info, self.context) - def _at(self, idx): raise NotSupportedForMKLNDArray(self._at, '[idx]', idx) From 43b6f5cb4ca11b9ae6ccf72338df68a131fa8a97 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 19:08:51 +0000 Subject: [PATCH 151/264] better condition check for readability. --- src/kvstore/kvstore_local.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 41b5b3030dd8..5646d9eef866 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -256,13 +256,13 @@ class KVStoreLocal : public KVStore { auto validator = [this](const int key, const NDArray& nd) -> bool { auto stype = nd.storage_type(); // valid NDArray - if (stype == kDefaultStorage || stype == kRowSparseStorage - // When it's kMKLDNNStorage, it'll be converted to a data layout - // compatible to the default storage. + auto valid_stype = stype == kDefaultStorage || stype == kRowSparseStorage; #if MXNET_USE_MKLDNN == 1 - || stype == kMKLDNNStorage + // When it's kMKLDNNStorage, it'll be converted to a data layout + // compatible to the default storage. + valid_stype = valid_stype || stype == kMKLDNNStorage; #endif - ) return true; + if (valid_stype) return true; // invalid NDArray, abort LOG(FATAL) << "Unexpected storage type detected during kvstore push: " << stype; return false; @@ -280,13 +280,13 @@ class KVStoreLocal : public KVStore { auto validator = [this](const int key, const NDArray* nd) -> bool { auto stype = nd->storage_type(); // valid - if (stype == kDefaultStorage - // When it's kMKLDNNStorage, it'll be converted to a data layout - // compatible to the default storage. + auto valid_stype = stype == kDefaultStorage; #if MXNET_USE_MKLDNN == 1 - || stype == kMKLDNNStorage + // When it's kMKLDNNStorage, it'll be converted to a data layout + // compatible to the default storage. + valid_stype = valid_stype || stype == kMKLDNNStorage; #endif - ) return true; + if (valid_stype) return true; // invalid, print warning messages once if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) { LOG(INFO) << "Warning: non-default weights detected during kvstore pull. " From b3f7d6ebf7a4d5fb2af5bdb0d891bf4866ab9126 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 19:10:06 +0000 Subject: [PATCH 152/264] Use macro when including mkldnn.hpp. 
--- src/ndarray/ndarray.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 67c3f1c3c3ba..a467a9b06578 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -31,7 +31,9 @@ #include #include #include +#if MXNET_USE_MKLDNN == 1 #include +#endif #include "./ndarray_function.h" #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" From ead065f8969af4f6e38e6d79db816c91ccd699f6 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 8 Dec 2017 20:07:53 +0000 Subject: [PATCH 153/264] Revert "Use CoreOpRunner for refactored Ops." This reverts commit a28586fc25950cc006cb317e26e0d17541ef0586. --- tests/cpp/operator/activation_perf.cc | 4 ++-- tests/cpp/operator/fully_conn_perf.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index fe51be533510..e482848705ad 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -41,7 +41,7 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_activation_args; kwargs.push_back({"act_type", "tanh"}); - test::op::CoreOpRunner runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, { shape }, kwargs, 1); } @@ -52,7 +52,7 @@ TEST(ACTIVATION_PERF, TimingCPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - test::op::CoreOpRunner runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc index 8c32e51e3161..c8d8021f6f6e 100644 --- a/tests/cpp/operator/fully_conn_perf.cc +++ b/tests/cpp/operator/fully_conn_perf.cc @@ -41,7 +41,7 @@ const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} }; TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { TShape shape({5, 5}); kwargs_t kwargs = basic_fullyconn_args; - test::op::CoreOpRunner runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, { shape }, kwargs, 1); } @@ -50,7 +50,7 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { kwargs_t kwargs = basic_fullyconn_args; - test::op::CoreOpRunner runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache From 447c2d2324a2d7421f61cdb77140bf17fa421019 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 9 Dec 2017 00:58:16 +0000 Subject: [PATCH 154/264] Fix a bug in test core. 
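The fix below stops assuming a static input count. NNVM ops whose arity depends on their attributes (Concat's num_args, for example) publish a get_num_inputs functor, and the test harness has to prefer it over the static num_inputs field. A minimal sketch of the rule, with simplified stand-in types rather than the real nnvm::Op:

#include <functional>

struct Attrs { int num_args = 1; };  // stand-in for parsed operator attributes

struct Op {
  int num_inputs = 1;                                // static default arity
  std::function<int(const Attrs &)> get_num_inputs;  // optional dynamic arity
};

// Prefer the dynamic count when the op provides one, as the fix below does.
int NumInputs(const Op &op, const Attrs &attrs) {
  return op.get_num_inputs ? op.get_num_inputs(attrs) : op.num_inputs;
}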
--- tests/cpp/include/test_core_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 6a220bdad6d7..1074d0182198 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -314,7 +314,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer // Set up forward attrs_ = ParseAttrs(op_, args); - const int num_inputs = op_->num_inputs; + int num_inputs = op_->num_inputs; + if (op_->get_num_inputs) + num_inputs = op_->get_num_inputs(attrs_); if (!inputs.empty()) { CHECK_EQ(inputs.size(), static_cast(num_inputs)); From 0cf3789797ebb6caff882527cb683a4d710e7055 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 9 Dec 2017 01:30:40 +0000 Subject: [PATCH 155/264] Limit MKLDNN ops being used. --- src/operator/nn/activation.cc | 10 ++++++++-- src/operator/nn/convolution.cc | 22 ++++++++++++++++------ src/operator/nn/deconvolution.cc | 14 ++++++++++++-- src/operator/nn/fully_connected.cc | 10 ++++++++-- src/operator/nn/pooling.cc | 10 ++++++++-- 5 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 5374495151ff..a9e438c6c1cf 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -98,7 +98,10 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param) + // There is no reason to use MKLDNN activation if the input isn't in + // MKLDNN format. + && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -121,7 +124,10 @@ inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param) + // There is no reason to use MKLDNN activation if the input isn't in + // MKLDNN format. + && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 8513e23d5036..4b7e0dac337f 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -293,17 +293,22 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs, } inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); uint32_t in_expected = param.no_bias ? 2 : 3; CHECK_EQ(in_attrs->size(), in_expected); CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask + // We should allow MKLDNN conv to apply to the default storage as well. + // Even with format conversion, MKLDNN conv should still be faster than + // the native implementation. 
+ && (in_attrs->at(0) == kMKLDNNStorage + || in_attrs->at(0) == kDefaultStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -326,7 +331,12 @@ inline static bool backward_ConvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask + // We should allow MKLDNN conv to apply to the default storage as well. + // Even with format conversion, MKLDNN conv should still be faster than + // the native implementation. + && (in_attrs->at(0) == kMKLDNNStorage + || in_attrs->at(0) == kDefaultStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 25d971bd5994..6e826ce18b1d 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -267,7 +267,12 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask + // We should allow MKLDNN conv to apply to the default storage as well. + // Even with format conversion, MKLDNN conv should still be faster than + // the native implementation. + && (in_attrs->at(0) == kMKLDNNStorage + || in_attrs->at(0) == kDefaultStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -293,7 +298,12 @@ inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask + // We should allow MKLDNN conv to apply to the default storage as well. + // Even with format conversion, MKLDNN conv should still be faster than + // the native implementation. + && (in_attrs->at(0) == kMKLDNNStorage + || in_attrs->at(0) == kDefaultStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index dbaae27ad764..57b4bdf3e30a 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -138,7 +138,10 @@ inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + // The native implementation uses BLAS. It shouldn't be slower than MKLDNN + // FC. If the input data has the default format, there is format conversion + // overhead as well. + if (dev_mask == mshadow::cpu::kDevMask && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -160,7 +163,10 @@ inline static bool backward_FCStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + // The native implementation uses BLAS. It shouldn't be slower than MKLDNN + // FC. If the input data has the default format, there is format conversion + // overhead as well. 
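// Dispatch therefore keys off the first input's storage type: only arrays
// already in an MKLDNN layout take the MKLDNN path.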
+ if (dev_mask == mshadow::cpu::kDevMask && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index ed20a7cf347f..19de29684cc7 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -300,7 +300,10 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param) + // There is no reason to use MKLDNN pooling if the input isn't in + // MKLDNN format. + && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; @@ -322,7 +325,10 @@ inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param) + // There is no reason to use MKLDNN pooling if the input isn't in + // MKLDNN format. + && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; From f6654d8c0fd9b72f652a8c1d03a333a958ffd799 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 9 Dec 2017 01:44:06 +0000 Subject: [PATCH 156/264] Fix complains from "make pylint" --- python/mxnet/ndarray/mkldnn.py | 22 +++------------------- python/mxnet/ndarray/sparse.py | 3 ++- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/python/mxnet/ndarray/mkldnn.py b/python/mxnet/ndarray/mkldnn.py index d1b065abfb76..85544266015f 100644 --- a/python/mxnet/ndarray/mkldnn.py +++ b/python/mxnet/ndarray/mkldnn.py @@ -22,31 +22,16 @@ from __future__ import absolute_import from __future__ import division -try: - from __builtin__ import slice as py_slice - from __builtin__ import sum as py_sum -except ImportError: - from builtins import slice as py_slice - from builtins import sum as py_sum -import ctypes import warnings -__all__ = ["_ndarray_cls", "MKLNDArray"] +__all__ = ["MKLNDArray"] -import numpy as np -from ..base import _LIB, numeric_types -from ..base import c_array, mx_real_t, integer_types -from ..base import mx_uint, NDArrayHandle, check_call from ..context import Context from . import _internal from . import op -from ._internal import _set_ndarray_class -from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_MKLDNN -from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT -from .ndarray import zeros as _zeros_ndarray -from .ndarray import array as _array +from .ndarray import NDArray +from .sparse import _ndarray_cls class MKLNDArray(NDArray): """The base class of an NDArray stored in a MKLDNN storage format. 
@@ -100,4 +85,3 @@ def copyto(self, other): return _internal._copyto(self, out=hret) else: raise TypeError('copyto does not support type ' + str(type(other))) - diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index d1f5e91b2c8c..e29d3da9b476 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -49,7 +49,8 @@ pass from ._internal import _set_ndarray_class from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR, _STORAGE_TYPE_MKLDNN +from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR +from .ndarray import _STORAGE_TYPE_MKLDNN from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT from .ndarray import zeros as _zeros_ndarray from .ndarray import array as _array From aa6c19fb0fe5d4fbe66e5f77368d59d062d046c7 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 9 Dec 2017 02:39:56 +0000 Subject: [PATCH 157/264] Move ContainStorage to common/utils.h --- src/common/utils.h | 22 +++++++++++++++++++ src/operator/tensor/elemwise_sum.cc | 34 ++++------------------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/src/common/utils.h b/src/common/utils.h index fcdf402fafed..5a06bdb53cac 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -218,6 +218,28 @@ void CheckFormatImpl(const RunContext &rctx, const NDArray &input, template void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output); +/*! \brief returns true if one of storage types in `inputs` is the same as target `stype`. + */ +inline bool ContainsStorage(const std::vector& inputs, + NDArrayStorageType type) { + for (const auto &i : inputs) { + if (i.storage_type() == type) + return true; + } + return false; +} + +/*! \brief returns true if one of storage types in `vstorage` is the same as target `stype`. + */ +inline bool ContainsStorage(const std::vector &vstorages, + NDArrayStorageType type) { + for (const auto& i : vstorages) { + if (i == type) + return true; + } + return false; +} + /*! \brief returns true if all storage types in `vstorage` are the same as target `stype`. * false is returned for empty inputs. 
*/ diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 73a8ae2f246b..ed12917594e2 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -25,6 +25,7 @@ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" #include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../common/utils.h" namespace mxnet { namespace op { @@ -73,33 +74,6 @@ bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } -static inline bool ContainStorage(const std::vector &storages, - NDArrayStorageType type) { - for (const auto& i : storages) { - if (i == type) - return true; - } - return false; -} - -static inline bool ContainStorage(const std::vector& inputs, - NDArrayStorageType type) { - for (const auto &i : inputs) { - if (i.storage_type() == type) - return true; - } - return false; -} - -static inline bool ContainOnlyStorage(const std::vector& inputs, - NDArrayStorageType type) { - for (const auto &i : inputs) { - if (i.storage_type() != type) - return false; - } - return true; -} - bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -109,7 +83,7 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask - && ContainStorage(*in_attrs, kMKLDNNStorage)) { + && common::ContainsStorage(*in_attrs, kMKLDNNStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -136,10 +110,10 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, NDArray out_nd = outputs[0]; mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); #if MXNET_USE_MKLDNN == 1 - } else if (ContainStorage(inputs, kMKLDNNStorage)) { + } else if (common::ContainsStorage(inputs, kMKLDNNStorage)) { MKLDNNSum_Forward(attrs, op_ctx, inputs, req[0], outputs[0]); #endif - } else if (ContainOnlyStorage(inputs, kDefaultStorage)) { + } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { // This case happens when we want to create an MKLDNN NDArray but the type // or the shape isn't supported by MKLDNN. In this case, NDArray falls back // to the default storage type and, thus, we have to handle the default From 460a0a8609a8d271619d5af0513923d4c167e73e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 9 Dec 2017 02:41:02 +0000 Subject: [PATCH 158/264] Limit MKLDNN concat being used. 
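Concat now dispatches to MKLDNN only when it is likely to pay off: the
forward pass requires at least one input that already carries
MKLDNN-formatted data, and the backward pass requires the output
gradient to be in MKLDNN format. A minimal sketch of the forward rule,
using the ContainsStorage helper moved into common/utils.h in the
previous patch; the std::any_of form is an equivalent design note, not
code from this patch:

    // Use MKLDNN concat iff we run on CPU and at least one input is
    // already MKLDNN-formatted; converting every input first would
    // likely cost more than the native concat.
    bool use_mkldnn = dev_mask == mshadow::cpu::kDevMask
        && common::ContainsStorage(*in_attrs, kMKLDNNStorage);

    // Equivalent formulation with <algorithm>:
    bool use_mkldnn2 = dev_mask == mshadow::cpu::kDevMask
        && std::any_of(in_attrs->begin(), in_attrs->end(),
                       [](int s) { return s == kMKLDNNStorage; });
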
--- src/operator/nn/concat-inl.h | 16 +++++++--------- src/operator/nn/concat.cc | 8 ++++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h index dc6a6c04fb52..3bbc5cb2f982 100644 --- a/src/operator/nn/concat-inl.h +++ b/src/operator/nn/concat-inl.h @@ -93,27 +93,25 @@ class ConcatOp { Concatenate(data, &out, 1, req[concat_enum::kOut]); } - void Backward(const OpContext &ctx, - const std::vector &out_grad, + void Backward(const OpContext &ctx, const TBlob &out_grad, const std::vector &req, const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); CHECK_EQ(in_grad.size(), static_cast(size_)); Stream *s = ctx.get_stream(); std::vector > grad_in(size_); Tensor grad; size_t leading = 1, trailing = 1; for (int i = 0; i < dimension_; ++i) { - leading *= out_grad[concat_enum::kOut].shape_[i]; + leading *= out_grad.shape_[i]; } - for (int i = dimension_ + 1; i < out_grad[concat_enum::kOut].ndim(); ++i) { - trailing *= out_grad[concat_enum::kOut].shape_[i]; + for (int i = dimension_ + 1; i < out_grad.ndim(); ++i) { + trailing *= out_grad.shape_[i]; } - size_t mid = out_grad[concat_enum::kOut].shape_[dimension_]; + size_t mid = out_grad.shape_[dimension_]; Shape<3> oshape = Shape3(leading, mid, trailing); - grad = out_grad[concat_enum::kOut].get_with_shape(oshape, s); + grad = out_grad.get_with_shape(oshape, s); for (int i = 0; i < size_; ++i) { Shape<3> dshape = Shape3(leading, in_grad[i].shape_[dimension_], trailing); @@ -149,7 +147,7 @@ void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, { ConcatOp op; op.Init(param); - op.Backward(ctx, inputs, req, outputs); + op.Backward(ctx, inputs[concat_enum::kOut], req, outputs); }); } diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index f660b335af0b..d524529e9038 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -26,6 +26,7 @@ #include "./concat-inl.h" #include "./mkldnn/mkldnn_ops-inl.h" +#include "../../common/utils.h" namespace mxnet { namespace op { @@ -112,7 +113,9 @@ inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 1U); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask + // There must be at least one array that are in MKLDNN format. + && common::ContainsStorage(*in_attrs, kMKLDNNStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -130,7 +133,8 @@ inline static bool backward_ConcatStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { #if MXNET_USE_MKLDNN == 1 CHECK_EQ(out_attrs->size(), in_attrs->size() - 1); - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask + && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kMKLDNNStorage; From e1d00644853fe0de29ad25a21e9d762b9b59a70f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 9 Dec 2017 05:41:28 +0000 Subject: [PATCH 159/264] Add license. 
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 53ded72ac642..6d8efc955e08 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /******************************************************************************* * Copyright 2016-2017 Intel Corporation * From b8bfd7faa834d564375d46dff56fa47c6b4b5b55 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 11 Dec 2017 00:24:47 +0000 Subject: [PATCH 160/264] Fix amalgamation --- amalgamation/mxnet_predict0.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc index f35591d82b22..cfee60559501 100644 --- a/amalgamation/mxnet_predict0.cc +++ b/amalgamation/mxnet_predict0.cc @@ -66,7 +66,7 @@ #include "src/operator/operator_util.cc" #include "src/operator/nn/activation.cc" #include "src/operator/nn/batch_norm.cc" -#include "src/operator/concat.cc" +#include "src/operator/nn/concat.cc" #include "src/operator/nn/convolution.cc" #include "src/operator/nn/deconvolution.cc" #include "src/operator/nn/dropout.cc" From df63a79d754046f74b522e7f822039ca86609e06 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 11 Dec 2017 00:29:32 +0000 Subject: [PATCH 161/264] Fix compilation error in mkldnn_ops-inl.h --- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 4d2543dc6f25..8934e963d578 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -23,6 +23,11 @@ * \author Da Zheng */ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ + +#if MXNET_USE_MKLDNN == 1 + #include #include #include @@ -33,10 +38,6 @@ #include #include -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ - -#if MXNET_USE_MKLDNN == 1 namespace mxnet { namespace op { From 69342345d080f919a6eca3d47219b01bfa50152a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 11 Dec 2017 04:31:03 +0000 Subject: [PATCH 162/264] Fix a bug in deconvolution. 
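The backward pass of deconvolution hard-coded the expected in_data size
to 2, which is wrong whenever a bias is present, and the bias gradient
was only requested under CUDNN. Both now derive from no_bias. The
invariant the fix below restores, condensed from deconvolution-inl.h
(not a complete function):

    // in_data, in_grad and req all cover data + weight, plus bias
    // unless no_bias is set.
    size_t expected = param_.no_bias == 0 ? 3 : 2;
    CHECK_EQ(in_data.size(), expected);
    CHECK_EQ(in_grad.size(), expected);
    CHECK_EQ(req.size(), expected);
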
--- src/operator/nn/deconvolution-inl.h | 2 +- src/operator/nn/deconvolution.cc | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 3dddee7daa46..f217779f6458 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -320,7 +320,7 @@ class DeconvolutionOp { // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 3 : 2; - CHECK_EQ(in_data.size(), 2U); + CHECK_EQ(in_data.size(), expected); CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 6e826ce18b1d..e36d6d823d2a 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -290,11 +290,7 @@ inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); uint32_t out_expected = param.no_bias ? 2 : 3; -#if MXNET_USE_CUDNN == 1 CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U); -#else - CHECK_EQ(in_attrs->size(), 3U); -#endif CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 @@ -386,11 +382,9 @@ struct DeconvolutionGrad { std::vector heads(ograds.begin(), ograds.end()); heads.push_back(n->inputs[deconv::kData]); heads.push_back(n->inputs[deconv::kWeight]); -#if MXNET_USE_CUDNN == 1 const DeconvolutionParam& param = nnvm::get(n->attrs.parsed); if (!param.no_bias) heads.push_back(n->inputs[deconv::kBias]); -#endif return MakeGradNode(op_name, n, heads, n->attrs.dict); } }; From 511ac71385dcde8f8c7f895ce950272f955e4406 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 11 Dec 2017 23:32:29 +0000 Subject: [PATCH 163/264] Fix a bug in pooling. --- src/operator/nn/pooling.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 19de29684cc7..aad01f025295 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -300,6 +300,8 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); + auto expected = MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; + CHECK_EQ(out_attrs->size(), expected); if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param) // There is no reason to use MKLDNN pooling if the input isn't in // MKLDNN format. @@ -309,8 +311,9 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, (*out_attrs)[i] = kMKLDNNStorage; return true; } -#endif +#else CHECK_EQ(out_attrs->size(), 1); +#endif *dispatch_mode = DispatchMode::kFCompute; (*out_attrs)[0] = kDefaultStorage; return true; From 9336bccfb9100d8e1d4e5191cd1a1cddcfc84a81 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 11 Dec 2017 23:29:03 +0000 Subject: [PATCH 164/264] MKLDNN ops allocates temp mem. 
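MKLDNN operators need scratch buffers for reorders and for kAddTo
accumulation. Rather than letting each mkldnn::memory constructor call
malloc, operators now declare ResourceRequest::kTempSpace and draw
temporary buffers from a thread-local TmpMemMgr bound to that resource.
A minimal usage sketch, assuming an FComputeEx body registered with one
temp-space request; SomeOpForwardEx and pd are illustrative
placeholders, while the manager itself is defined in mkldnn_base-inl.h
below:

    void SomeOpForwardEx(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
                         const std::vector<NDArray> &inputs,
                         const std::vector<OpReqType> &req,
                         const std::vector<NDArray> &outputs) {
      // Bind the operator's temp space once per invocation.
      TmpMemMgr::Instance().Init(ctx.requested[0]);
      // Scratch memory now comes out of that space; if it runs out,
      // Alloc falls back to a direct allocation with a warning.
      mkldnn_mem_ptr scratch = TmpMemMgr::Instance().Alloc(pd);
      // ... build and register primitives ...
      MKLDNNStream::Instance().Submit();  // also resets the manager
    }
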
--- src/ndarray/ndarray.cc | 13 +-- src/operator/nn/activation-inl.h | 1 + src/operator/nn/activation.cc | 5 + src/operator/nn/concat-inl.h | 1 + src/operator/nn/concat.cc | 10 ++ src/operator/nn/fully_connected-inl.h | 1 + src/operator/nn/fully_connected.cc | 10 ++ src/operator/nn/mkldnn/mkldnn_act-inl.h | 8 +- src/operator/nn/mkldnn/mkldnn_base-inl.h | 98 +++++++++++++++++-- src/operator/nn/mkldnn/mkldnn_concat.cc | 33 ++++--- src/operator/nn/mkldnn/mkldnn_convolution.cc | 2 + src/operator/nn/mkldnn/mkldnn_copy.cc | 5 +- .../nn/mkldnn/mkldnn_deconvolution.cc | 2 + .../nn/mkldnn/mkldnn_fully_connected.cc | 2 + src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 1 + src/operator/nn/pooling.cc | 69 +++++++------ .../tensor/elemwise_binary_op_basic.cc | 5 + .../tensor/elemwise_unary_op_basic.cc | 5 + src/operator/tensor/matrix_op.cc | 5 + 19 files changed, 205 insertions(+), 71 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index a467a9b06578..f3137f453512 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -500,13 +500,6 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { } } -static int GetTypeSize(int dtype) { - MSHADOW_TYPE_SWITCH(dtype, DType, { - return sizeof(DType); - }); - return -1; -} - std::shared_ptr NDArray::GetMKLDNNData( const mkldnn::memory::primitive_desc &desc) const { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { @@ -566,8 +559,7 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( stream.RegisterMem(ret); return ret; } else { - // TODO(zhengda) we should manage the memory allocation here. - mkldnn_mem_ptr ret(new mkldnn::memory(desc)); + mkldnn_mem_ptr ret = TmpMemMgr::Instance().Alloc(desc); stream.RegisterMem(ret); stream.RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); return ret; @@ -649,7 +641,8 @@ std::shared_ptr NDArray::CreateMKLDNNData( return ptr_->Mkl_mem_; } - ptr_->Mkl_mem_ = CreateMKLDNNMem(desc); + ptr_->Mkl_mem_ = mkldnn_mem_ptr(new mkldnn::memory(desc)); + MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_; } #endif diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index f297d92cf598..9b82c83ca0e8 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -46,6 +46,7 @@ namespace op { namespace activation { enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; +enum ActivationOpResource {kTempSpace}; enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU}; } // activation diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index a9e438c6c1cf..70fe88bc8753 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -170,6 +170,11 @@ NNVM_REGISTER_OP(_backward_Activation) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{0, 0}}; }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr_parser(ParamParser) .set_attr("FCompute", ActivationGradCompute) .set_attr("FComputeEx", ActivationGradComputeEx_CPU); diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h index 3bbc5cb2f982..f4c9104e181a 100644 --- a/src/operator/nn/concat-inl.h +++ b/src/operator/nn/concat-inl.h @@ -41,6 +41,7 @@ namespace op { namespace concat_enum { enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; +enum ConcatOpResource {kTempSpace}; enum ConcatOpOutputs {kOut}; } // namespace concat_enum diff --git 
a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index d524529e9038..7de5706bf956 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -258,6 +258,11 @@ Example:: } return ret; }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInferShape", ConcatShape) .set_attr("FInferType", ConcatType) .set_attr("FInferStorageType", ConcatForwardInferStorageType) @@ -276,6 +281,11 @@ NNVM_REGISTER_OP(_backward_Concat) return params.num_args; }) .set_attr_parser(ParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("TIsBackward", true) .set_attr("FInferStorageType", backward_ConcatStorageType) .set_attr("FCompute", ConcatGradCompute) diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index 4646d3a5e199..e8e95643e647 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -43,6 +43,7 @@ namespace op { // These enums are only visible within this header namespace fullc { enum FullyConnectedOpInputs {kData, kWeight, kBias}; +enum FullyConnectedOpResource {kTempSpace}; enum FullyConnectedOpOutputs {kOut}; } // fullc diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 57b4bdf3e30a..4459a9bf505b 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -218,6 +218,11 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored. return std::vector{"data", "weight"}; } }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInferShape", FullyConnectedShape) .set_attr("FInferType", FullyConnectedType) .set_attr("FCompute", FullyConnectedCompute) @@ -234,6 +239,11 @@ NNVM_REGISTER_OP(_backward_FullyConnected) const FullyConnectedParam& params = nnvm::get(attrs.parsed); return params.no_bias ? 
2 : 3; }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("TIsBackward", true) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{1, 0}}; diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index be625b87e3d2..7339de81171b 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -71,7 +71,8 @@ static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { template void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, - const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { std::shared_ptr input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); @@ -95,8 +96,8 @@ void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, template void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, - const NDArray &out_grad, const NDArray &in_data, const OpReqType &req, - const NDArray &in_grad) { + const NDArray &out_grad, const NDArray &in_data, + const OpReqType &req, const NDArray &in_grad) { if (req == kNullOp) { return; } @@ -108,6 +109,7 @@ void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); auto cpu_engine = data_mpd.get_engine(); Dtype alpha = 0; + TmpMemMgr::Instance().Init(ctx.requested[activation::kTempSpace]); auto alg = GetMKLDNNActAlgo(param); mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training, diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 6d8efc955e08..7e78ddff859e 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -39,6 +39,7 @@ * deepthi.karkada@intel.com * louis.feng@intel.com * adam.d.straw@intel.com +* zhengda1936@gmail.com * *******************************************************************************/ @@ -129,6 +130,17 @@ static inline bool SupportMKLDNNConv(const NDArray &input) { return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4; } +static int GetTypeSize(int dtype) { + MSHADOW_TYPE_SWITCH(dtype, DType, { + return sizeof(DType); + }); + return -1; +} + +static inline size_t GetArraySize(const NDArray &arr) { + return arr.shape().Size() * GetTypeSize(arr.dtype()); +} + static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { switch (dtype) { case mshadow::kFloat32: @@ -175,6 +187,67 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, typedef std::shared_ptr mkldnn_mem_ptr; typedef std::shared_ptr mkldnn_mem_const_ptr; +class TmpMemMgr { + // This points to the memory buffer where we can allocate temp memory. + char *curr_mem; + // The total size of the temp memory. + size_t mem_size; + // This contains the current available memory size. + size_t curr_size; + // This estimate the required temp memory size in an operator. 
+ size_t est_size; + public: + static TmpMemMgr &Instance() { + static thread_local TmpMemMgr mgr; + return mgr; + } + + TmpMemMgr() { + Reset(); + est_size = 0; + mem_size = 0; + } + + void Reset() { + curr_mem = nullptr; + curr_size = 0; + // We don't reset est_size and mem_size because est_size contains the + // estimated temp memory size from the last run and mem_size contains the + // memroy size allocated in the last run. + } + + void Init(const Resource &r) { + // If the last time, if we estimate that we need more memory, we should the + // larger memory size. + mem_size = std::max(mem_size, est_size); + if (mem_size > 0) { + // Let's allocate some extra memory. If we don't use some of them all the time, + // the OS won't physically allocate pages for them any way. + this->curr_mem = static_cast(r.get_host_space_internal(mem_size * 2)); + this->curr_size = mem_size * 2; + } + // reset est_size, so we can start to estimate the temp memory size. + this->est_size = 0; + } + + mkldnn_mem_ptr Alloc(const mkldnn::memory::primitive_desc &pd) { + this->est_size += pd.get_size(); + if (pd.get_size() <= this->curr_size) { + // The memory is allocated from the temporary memory space in the + // operator. It'll only become invalid after we exit from the operator. + // TODO I need to make sure memory allocated here is aligned. + mkldnn_mem_ptr ret(new mkldnn::memory(pd, this->curr_mem)); + this->curr_size -= pd.get_size(); + this->curr_mem += pd.get_size(); + return ret; + } else { + LOG(WARNING) << "Allocate " << pd.get_size() + << " bytes with malloc directly"; + return mkldnn_mem_ptr(new mkldnn::memory(pd)); + } + } +}; + class MKLDNNStream { std::vector net; // Here we hold all memory related to the operators in the stream. @@ -195,13 +268,13 @@ class MKLDNNStream { mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); net.clear(); mem_holder.clear(); + TmpMemMgr::Instance().Reset(); } }; -inline static mkldnn_mem_ptr CreateMKLDNNMem( +inline static mkldnn_mem_ptr CreateMKLDNNTempMem( const mkldnn::memory::primitive_desc &desc) { - // TODO(zhengda) allocate memory more efficiently. - std::shared_ptr ret(new mkldnn::memory(desc)); + mkldnn_mem_ptr ret = TmpMemMgr::Instance().Alloc(desc); MKLDNNStream::Instance().RegisterMem(ret); return ret; } @@ -218,13 +291,18 @@ static inline mkldnn_output_t CreateMKLDNNMem( const NDArray &arr, const mkldnn::memory::primitive_desc &desc, OpReqType req) { if (kAddTo == req) { - return mkldnn_output_t(OutDataOp::AddBack, CreateMKLDNNMem(desc)); + auto tmp = TmpMemMgr::Instance().Alloc(desc); + MKLDNNStream::Instance().RegisterMem(tmp); + return mkldnn_output_t(OutDataOp::AddBack, tmp); } else { mkldnn_mem_ptr mem = const_cast(arr).CreateMKLDNNData(desc); - if (mem == nullptr) - return mkldnn_output_t(OutDataOp::CopyBack, CreateMKLDNNMem(desc)); - else + if (mem == nullptr) { + auto tmp = TmpMemMgr::Instance().Alloc(desc); + MKLDNNStream::Instance().RegisterMem(tmp); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { return mkldnn_output_t(OutDataOp::Noop, mem); + } } } @@ -242,8 +320,8 @@ static inline void CommitOutput(const NDArray &arr, arr.GetMKLDNNData(res.second->get_primitive_desc()); CHECK(mem != nullptr); // We have to allocate new memory for the sum result. 
- mkldnn_mem_ptr sum_res( - new mkldnn::memory(res.second->get_primitive_desc())); + mkldnn_mem_ptr sum_res = TmpMemMgr::Instance().Alloc( + res.second->get_primitive_desc()); MKLDNNStream::Instance().RegisterMem(sum_res); op::Sum(*res.second, *mem, *sum_res); const_cast(arr).CopyFrom(*sum_res); @@ -290,7 +368,7 @@ inline static mkldnn_mem_const_ptr GetWeights( } if (mem->get_primitive_desc() == target_pd) return mem; - std::shared_ptr ret = CreateMKLDNNMem(target_pd); + std::shared_ptr ret = CreateMKLDNNTempMem(target_pd); MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); return ret; } diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc index 8171784d56cf..867311d040b0 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -33,6 +33,7 @@ namespace op { void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + TmpMemMgr::Instance().Init(ctx.requested[concat_enum::kTempSpace]); const ConcatParam& param = nnvm::get(attrs.parsed); int num_in_data = param.num_args; int concat_dim = param.dim; @@ -56,6 +57,7 @@ void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + TmpMemMgr::Instance().Init(ctx.requested[concat_enum::kTempSpace]); const ConcatParam& param = nnvm::get(attrs.parsed); int num_in_data = param.num_args; int axis_ = param.dim; @@ -65,20 +67,23 @@ void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, /* init the offset */ mkldnn::memory::dims offsets = {0, 0, 0, 0}; for (int i = 0; i < num_in_data; i++) { - mkldnn::memory::dims diff_src_tz = {inputs[i+1].shape()[0], inputs[i+1].shape()[1], - inputs[i+1].shape()[2], inputs[i+1].shape()[3]}; - auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc(); - auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]); - // create view from gy to gxs[i] - std::shared_ptr view_pd; - view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets)); - // create reorder primitive from gy to gxs[i] - mkldnn::reorder::primitive_desc reorder_pd( - view_pd.get()->dst_primitive_desc(), diff_src_mpd); - offsets[axis_] += diff_src_tz[axis_]; - MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder( - reorder_pd, *gz_mem, *gradi_mem_.second)); - CommitOutput(outputs[i], gradi_mem_); + mkldnn::memory::dims diff_src_tz + = {static_cast(inputs[i+1].shape()[0]), + static_cast(inputs[i+1].shape()[1]), + static_cast(inputs[i+1].shape()[2]), + static_cast(inputs[i+1].shape()[3])}; + auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc(); + auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]); + // create view from gy to gxs[i] + std::shared_ptr view_pd; + view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets)); + // create reorder primitive from gy to gxs[i] + mkldnn::reorder::primitive_desc reorder_pd( + view_pd.get()->dst_primitive_desc(), diff_src_mpd); + offsets[axis_] += diff_src_tz[axis_]; + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder( + reorder_pd, *gz_mem, *gradi_mem_.second)); + CommitOutput(outputs[i], gradi_mem_); } MKLDNNStream::Instance().Submit(); } diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc 
b/src/operator/nn/mkldnn/mkldnn_convolution.cc index a6e756fe9499..56d4bd557bcb 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -169,6 +169,7 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + TmpMemMgr::Instance().Init(ctx.requested[conv::kTempSpace]); const ConvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], @@ -195,6 +196,7 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + TmpMemMgr::Instance().Init(ctx.requested[conv::kTempSpace]); const std::vector &in_grad = outputs; auto engine = CpuEngine::Instance().get_engine(); const ConvolutionParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc index aa141f1e3e09..d15fd39ddf7a 100644 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ b/src/operator/nn/mkldnn/mkldnn_copy.cc @@ -36,13 +36,14 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &out_data) { auto in_mem = in_data.GetMKLDNNData(); if (req == kAddTo) { + TmpMemMgr::Instance().Init(ctx.requested[0]); // We should try and force the output memory has the same format // as the input memory. If not, we'll have to reorder memory. auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc()); if (out_mem == nullptr) out_mem = out_data.GetMKLDNNData(); - mkldnn_mem_ptr sum_res( - new mkldnn::memory(out_mem->get_primitive_desc())); + mkldnn_mem_ptr sum_res + = TmpMemMgr::Instance().Alloc(out_mem->get_primitive_desc()); MKLDNNStream::Instance().RegisterMem(sum_res); Sum(*in_mem, *out_mem, *sum_res); const_cast(out_data).CopyFrom(*sum_res); diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index cca73e3d9445..251f341d32da 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -154,6 +154,7 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + TmpMemMgr::Instance().Init(ctx.requested[deconv::kTempSpace]); const DeconvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd( @@ -187,6 +188,7 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + TmpMemMgr::Instance().Init(ctx.requested[deconv::kTempSpace]); const std::vector &in_grad = outputs; const DeconvolutionParam& param = nnvm::get(attrs.parsed); CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index f3ebb055d1b4..d79a7b6203eb 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ 
b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -83,6 +83,7 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + TmpMemMgr::Instance().Init(ctx.requested[fullc::kTempSpace]); const FullyConnectedParam& param = nnvm::get(attrs.parsed); const TShape& ishape = in_data[fullc::kData].shape(); const TShape& oshape = out_data[fullc::kOut].shape(); @@ -125,6 +126,7 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { + TmpMemMgr::Instance().Init(ctx.requested[fullc::kTempSpace]); const std::vector &in_grad = outputs; const FullyConnectedParam& param = nnvm::get(attrs.parsed); const TShape& ishape = inputs[fullc::kData + 1].shape(); diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index 06def9c28dc4..fed4075b322c 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -160,6 +160,7 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, return; } + TmpMemMgr::Instance().Init(ctx.requested[0]); std::shared_ptr diff_dst_mem = out_grad.GetMKLDNNData(); std::shared_ptr input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index aad01f025295..797edd960f93 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -387,48 +387,53 @@ For 3-D pooling, an additional *depth* dimension is added before height, width)*. )code" ADD_FILELINE) - .set_num_inputs(1) - .set_num_outputs([](const NodeAttrs& attrs) { +.set_num_inputs(1) +.set_num_outputs([](const NodeAttrs& attrs) { #if MXNET_USE_MKLDNN == 1 - const PoolingParam ¶m = nnvm::get(attrs.parsed); - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; + const PoolingParam ¶m = nnvm::get(attrs.parsed); + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 
2 : 1; #else - return 1; + return 1; #endif - }) +}) #if MXNET_USE_MKLDNN == 1 - .set_attr("FNumVisibleOutputs", - [](const NodeAttrs& attrs) { return 1; }) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { return 1; }) #endif - .set_attr_parser(PoolingParamParser) - .set_attr("FInferStorageType", PoolingStorageType) - .set_attr("FInferType", PoolingType) - .set_attr("FInferShape", PoolingShape) - .set_attr("FCompute", PoolingCompute) - .set_attr("FComputeEx", PoolingCompute_CPU) - .set_attr("FGradient", - ElemwiseGradUseInOut{"_backward_Pooling"}) - .add_argument("data", "NDArray-or-Symbol", - "Input data to the pooling operator.") - .add_arguments(PoolingParam::__FIELDS__()); +.set_attr_parser(PoolingParamParser) +.set_attr("FInferStorageType", PoolingStorageType) +.set_attr("FInferType", PoolingType) +.set_attr("FInferShape", PoolingShape) +.set_attr("FCompute", PoolingCompute) +.set_attr("FComputeEx", PoolingCompute_CPU) +.set_attr("FGradient", + ElemwiseGradUseInOut{"_backward_Pooling"}) +.add_argument("data", "NDArray-or-Symbol", + "Input data to the pooling operator.") +.add_arguments(PoolingParam::__FIELDS__()); NNVM_REGISTER_OP(_backward_Pooling) - .set_num_outputs(1) - .set_attr("TIsBackward", true) - .set_attr( - "FInplaceOption", - [](const NodeAttrs &attrs) { +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr( + "FInplaceOption", + [](const NodeAttrs &attrs) { #if MXNET_USE_CUDNN == 1 - return std::vector >(); + return std::vector >(); #else - return std::vector >{{1, 0}}; + return std::vector >{{1, 0}}; +#endif +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) #endif - }) - .set_attr("FInferStorageType", - backward_PoolingStorageType) - .set_attr_parser(PoolingParamParser) - .set_attr("FCompute", PoolingGradCompute) - .set_attr("FComputeEx", PoolingGradCompute_CPU); +.set_attr("FInferStorageType", + backward_PoolingStorageType) +.set_attr_parser(PoolingParamParser) +.set_attr("FCompute", PoolingGradCompute) +.set_attr("FComputeEx", PoolingGradCompute_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 4a3985d9cdbb..1c5ff0ec91d5 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -149,6 +149,11 @@ NNVM_REGISTER_OP(_backward_add) return std::vector >{{0, 0}, {0, 1}}; }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseNone< cpu, mshadow_op::identity, mshadow_op::identity>) .set_attr("FComputeEx", _backward_ElemwiseAddEx) diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 24505d438c5e..3590c088b66d 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -161,6 +161,11 @@ MXNET_OPERATOR_REGISTER_UNARY(_copy) .set_attr("FInferStorageType", CopyStorageType) .set_attr("FCompute", UnaryOp::IdentityCompute) .set_attr("FComputeEx", CopyEx) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; diff --git a/src/operator/tensor/matrix_op.cc 
b/src/operator/tensor/matrix_op.cc index a7318ca78d1b..9ce0a7273568 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -201,6 +201,11 @@ Example:: .set_attr("FGradient", ElemwiseGradUseNone{ "_backward_copy" }) .set_attr("FCompute", UnaryOp::IdentityCompute) .set_attr("FComputeEx", FlattenEx) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; From a8d41386574cca3f1d6c6e165fc1c5b1ebf76164 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 01:29:41 +0000 Subject: [PATCH 165/264] Fix a bug in pooling. --- src/operator/nn/pooling-inl.h | 31 +++++++++++++++++----- src/operator/nn/pooling.cc | 49 +++++++++++++++++++---------------- src/operator/nn/pooling.cu | 27 +++++++++++++------ 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 3f511dfaacd9..de98bc4981af 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -80,6 +80,13 @@ struct PoolingParam : public dmlc::Parameter { } }; +/* + * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which + * also changes the number of inputs for backward. + */ +int GetNumOutputs(const PoolingParam ¶m); +int GetNumBackInputs(const PoolingParam ¶m); + template void PoolingForward(const OpContext& ctx, const PoolingParam ¶m, const TBlob& in_data, const OpReqType& req, @@ -122,9 +129,9 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), GetNumOutputs(param)); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type @@ -142,16 +149,28 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK_EQ(inputs.size(), 3U); + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), GetNumBackInputs(param)); CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); - const PoolingParam& param = nnvm::get(attrs.parsed); + off_t ograd_idx, in_data_idx, out_data_idx; + // When MKLDNN is enabled, the input data may contains arrays for workspace. 
+ if (GetNumBackInputs(param) == 5) { + ograd_idx = 0; + in_data_idx = 2; + out_data_idx = 3; + } else { + ograd_idx = 0; + in_data_idx = 1; + out_data_idx = 2; + } MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - PoolingBackward(ctx, param, - inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + PoolingBackward(ctx, param, inputs[ograd_idx], + inputs[in_data_idx], inputs[out_data_idx], + req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; } diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 797edd960f93..5351110f198a 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -63,6 +63,22 @@ static void PoolingParamParser(nnvm::NodeAttrs *attrs) { attrs->parsed = std::move(param_); } +int GetNumOutputs(const PoolingParam ¶m) { +#if MXNET_USE_MKLDNN == 1 + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; +#else + return 1; +#endif +} + +int GetNumBackInputs(const PoolingParam ¶m) { +#if MXNET_USE_MKLDNN == 1 + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3; +#else + return 3; +#endif +} + static bool PoolingType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { @@ -225,8 +241,7 @@ void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, #endif std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); - // We know pooling has only one output. - std::vector out_blobs(1); + std::vector out_blobs(outputs.size()); for (size_t i = 0; i < out_blobs.size(); i++) out_blobs[i] = outputs[i].data(); PoolingCompute(attrs, ctx, in_blobs, req, out_blobs); @@ -260,18 +275,9 @@ void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, return; } #endif - std::vector in_blobs(3); - // In this case, there isn't workspace in the input arrays. - if (inputs.size() == 3) { - for (size_t i = 0; i < in_blobs.size(); i++) - in_blobs[i] = inputs[i].data(); - } else { - // There is workspace among the input arrays. One for out_grad and one for - // input. - in_blobs[0] = inputs[0].data(); // out grad - in_blobs[1] = inputs[2].data(); // in data - in_blobs[2] = inputs[3].data(); // out data - } + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); std::vector out_blobs(outputs.size()); for (size_t i = 0; i < out_blobs.size(); i++) out_blobs[i] = outputs[i].data(); @@ -315,7 +321,8 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(out_attrs->size(), 1); #endif *dispatch_mode = DispatchMode::kFCompute; - (*out_attrs)[0] = kDefaultStorage; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; return true; } @@ -324,10 +331,11 @@ inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, DispatchMode *dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), GetNumBackInputs(param)); CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - const PoolingParam ¶m = nnvm::get(attrs.parsed); if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param) // There is no reason to use MKLDNN pooling if the input isn't in // MKLDNN format. 
@@ -337,8 +345,9 @@ inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, (*out_attrs)[i] = kMKLDNNStorage; return true; } -#endif +#else CHECK_EQ(in_attrs->size(), 3); +#endif *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; @@ -389,12 +398,8 @@ height, width)*. )code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs([](const NodeAttrs& attrs) { -#if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; -#else - return 1; -#endif + return GetNumOutputs(param); }) #if MXNET_USE_MKLDNN == 1 .set_attr("FNumVisibleOutputs", diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index de7dbf12606d..c3bcecfc77b7 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -51,9 +51,9 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), GetNumOutputs(param)); #if MXNET_USE_CUDNN == 1 if (!param.cudnn_off && param.kernel.ndim() > 1) { @@ -88,10 +88,21 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK_EQ(inputs.size(), 3U); + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), GetNumBackInputs(param)); CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); - const PoolingParam& param = nnvm::get(attrs.parsed); + off_t ograd_idx, in_data_idx, out_data_idx; + // When MKLDNN is enabled, the input data may contains arrays for workspace. + if (GetNumBackInputs(param) == 5) { + ograd_idx = 0; + in_data_idx = 2; + out_data_idx = 3; + } else { + ograd_idx = 0; + in_data_idx = 1; + out_data_idx = 2; + } #if MXNET_USE_CUDNN == 1 if (!param.cudnn_off && param.kernel.ndim() > 1) { @@ -99,8 +110,8 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, switch (param.pool_type) { case pool_enum::kMaxPooling: case pool_enum::kAvgPooling: - GetCuDNNPoolingOp(param).Backward(ctx, - inputs[0], inputs[1], inputs[2], req[0], outputs[0]); + GetCuDNNPoolingOp(param).Backward(ctx, inputs[ograd_idx], + inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]); return; case pool_enum::kSumPooling: LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; @@ -114,8 +125,8 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - PoolingBackward(ctx, param, inputs[0], - inputs[1], inputs[2], req[0], outputs[0]); + PoolingBackward(ctx, param, inputs[ograd_idx], + inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]); } else { LOG(FATAL) << "unknown pooling type"; } From bbea66eeebc3f45be0d3e29a1e7c081cca086a89 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 06:39:19 +0000 Subject: [PATCH 166/264] Allocate align memory from temp space. 
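The temp-space carve-out must hand MKLDNN aligned buffers, so Alloc now
places each sub-buffer on a 4096-byte boundary via std::align, and the
size estimate reserves worst-case padding per allocation. For
reference, a self-contained illustration of the std::align contract
this relies on (buf, p, and space are illustrative):

    #include <cstddef>
    #include <memory>   // std::align

    char buf[8192];
    void *p = buf;
    std::size_t space = sizeof(buf);
    // On success std::align advances p to the first 4096-aligned
    // address, shrinks space by the alignment adjustment, and returns
    // the aligned pointer; on failure (not enough room for 1024 bytes)
    // it returns nullptr and leaves p and space untouched.
    void *aligned = std::align(4096, 1024, p, space);
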
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 7e78ddff859e..dafe6c69c594 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -52,6 +52,8 @@ #include #include #include +#include +#include #include "mkldnn.hpp" using namespace mkldnn; namespace mxnet { @@ -196,6 +198,8 @@ class TmpMemMgr { size_t curr_size; // This estimate the required temp memory size in an operator. size_t est_size; + const size_t alignment = 4096; + public: static TmpMemMgr &Instance() { static thread_local TmpMemMgr mgr; @@ -223,22 +227,25 @@ class TmpMemMgr { if (mem_size > 0) { // Let's allocate some extra memory. If we don't use some of them all the time, // the OS won't physically allocate pages for them any way. - this->curr_mem = static_cast(r.get_host_space_internal(mem_size * 2)); this->curr_size = mem_size * 2; + this->curr_mem = static_cast(r.get_host_space_internal(this->curr_size)); } // reset est_size, so we can start to estimate the temp memory size. this->est_size = 0; } mkldnn_mem_ptr Alloc(const mkldnn::memory::primitive_desc &pd) { - this->est_size += pd.get_size(); - if (pd.get_size() <= this->curr_size) { + // We need to include the size of the memory used for alignment. + this->est_size += pd.get_size() + alignment; + void *this_mem = this->curr_mem; + void *mem = std::align(alignment, pd.get_size(), this_mem, this->curr_size); + if (mem) { // The memory is allocated from the temporary memory space in the // operator. It'll only become invalid after we exit from the operator. - // TODO I need to make sure memory allocated here is aligned. - mkldnn_mem_ptr ret(new mkldnn::memory(pd, this->curr_mem)); + mkldnn_mem_ptr ret(new mkldnn::memory(pd, this_mem)); + CHECK_EQ(this_mem, mem); this->curr_size -= pd.get_size(); - this->curr_mem += pd.get_size(); + this->curr_mem = static_cast(this_mem) + pd.get_size(); return ret; } else { LOG(WARNING) << "Allocate " << pd.get_size() From 84e60cd6d6137e997a161797135e100f4d7cce76 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 07:50:19 +0000 Subject: [PATCH 167/264] Have parameter gradients stored in the default storage. --- src/operator/nn/convolution.cc | 9 ++++++++- src/operator/nn/deconvolution.cc | 10 ++++++++-- src/operator/nn/fully_connected.cc | 10 ++++++++-- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 4b7e0dac337f..e354a4536103 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -338,8 +338,15 @@ inline static bool backward_ConvStorageType(const nnvm::NodeAttrs& attrs, && (in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(0) == kDefaultStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; + (*out_attrs)[conv::kData] = kMKLDNNStorage; + // We don't want the parameter gradients are stored in MKLDNN storage. + // These will be sent to the KVstore to update the global parameters. + // We should convert storage inside an operator so that we can take + // advantage of TempSpace. 
+ (*out_attrs)[conv::kWeight] = kDefaultStorage; + if (!param.no_bias) + (*out_attrs)[conv::kBias] = kDefaultStorage; for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; return true; } #endif diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index e36d6d823d2a..ac241717d1d9 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -301,8 +301,14 @@ inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, && (in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(0) == kDefaultStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; + (*out_attrs)[deconv::kData] = kMKLDNNStorage; + // We don't want the parameter gradients are stored in MKLDNN storage. + // These will be sent to the KVstore to update the global parameters. + // We should convert storage inside an operator so that we can take + // advantage of TempSpace. + (*out_attrs)[deconv::kWeight] = kDefaultStorage; + if (!param.no_bias) + (*out_attrs)[deconv::kBias] = kDefaultStorage; return true; } #endif diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 4459a9bf505b..805ffbea2ba0 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -168,8 +168,14 @@ inline static bool backward_FCStorageType(const nnvm::NodeAttrs& attrs, // overhead as well. if (dev_mask == mshadow::cpu::kDevMask && in_attrs->at(0) == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; + (*out_attrs)[fullc::kData] = kMKLDNNStorage; + // We don't want the parameter gradients are stored in MKLDNN storage. + // These will be sent to the KVstore to update the global parameters. + // We should convert storage inside an operator so that we can take + // advantage of TempSpace. + (*out_attrs)[fullc::kWeight] = kDefaultStorage; + if (!param.no_bias) + (*out_attrs)[fullc::kBias] = kDefaultStorage; return true; } #endif From ad6c35de1485953259d3bf08f14706f3d2784b6f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 07:51:53 +0000 Subject: [PATCH 168/264] Handle all cases in CopyFrom. 
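NDArray::CopyFrom used to assume that a source memory with a different
shape always stored its data in the default layout. GetPrimitiveDesc is
factored out of Reorder2Default so the remaining case can be handled by
reordering into the default format first. In outline (comments only;
the full logic is in ndarray.cc below):

    // 1. shapes match                         -> plain reorder
    // 2. shapes differ, src in default format -> reinterpret the src
    //    data handle under this array's default-format desc, reorder
    // 3. shapes differ, src in a custom format -> reorder src into a
    //    temporary default-format buffer, then proceed as in case 2
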
--- src/ndarray/ndarray.cc | 45 +++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f3137f453512..a9dd3f0a4e28 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -222,13 +222,8 @@ static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) } } -static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, - bool submit_now = true) { - auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); - if (format == mem->get_primitive_desc().desc().data.format) - return mem; - - auto pd = mem->get_primitive_desc(); +static inline mkldnn::memory::primitive_desc GetPrimitiveDesc( + mkldnn::memory::primitive_desc pd, mkldnn_memory_format_t format) { mkldnn::memory::dims dims(pd.desc().data.ndims); for (size_t i = 0; i < dims.size(); i++) dims[i] = pd.desc().data.dims[i]; @@ -236,9 +231,17 @@ static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, mkldnn::memory::data_type cpp_type = static_cast( pd.desc().data.data_type); mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); - mkldnn_mem_ptr def_mem(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, - pd.get_engine()))); + return mkldnn::memory::primitive_desc(data_md, pd.get_engine()); +} +static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, + bool submit_now = true) { + auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); + if (format == mem->get_primitive_desc().desc().data.format) + return mem; + + auto def_pd = GetPrimitiveDesc(mem->get_primitive_desc(), format); + mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd)); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterMem(mem); stream.RegisterMem(def_mem); @@ -595,12 +598,13 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { ptr_->SetMKLMem(shape_, dtype_); auto from_desc = mem.get_primitive_desc().desc(); auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); + auto from_def_format = GetDefaultFormat(from_desc); // It's possible that the memory and the NDArray don't have the same shape. - if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { + if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims) + // If the source memory uses the default layout, we can reshape directly. + && from_def_format == from_desc.data.format) { // In this case, we can simply create a new MKLDNN memory for the required // shape. - // TODO(zhengda) let's just hope it's the default format for now. - CHECK_EQ(GetDefaultFormat(from_desc), from_desc.data.format); mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims); auto this_dtype = static_cast(this_desc.data.data_type); @@ -610,6 +614,23 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); stream.RegisterMem(tmp_mem); stream.RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); + } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { + // In this case, the source memory stores data in a customized layout. We + // need to reorganize the data in memory before we can reshape. 
+ auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format); + mkldnn_mem_ptr def_mem = CreateMKLDNNTempMem(def_pd); + MKLDNNStream &stream = MKLDNNStream::Instance(); + stream.RegisterPrim(mkldnn::reorder(mem, *def_mem)); + // Now we can reshape it + mkldnn::memory::dims dims(this_desc.data.dims, + this_desc.data.dims + this_desc.data.ndims); + auto this_dtype = static_cast(this_desc.data.data_type); + auto this_format = static_cast(GetDefaultFormat(this_desc)); + mkldnn::memory::desc data_md(dims, this_dtype, this_format); + mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle())); + stream.RegisterMem(tmp_mem); + stream.RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); } else { stream.RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); } } From be0d5b9feddf01c5bbbae5274a20e1cb25ed78ec Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 19:31:29 +0000 Subject: [PATCH 169/264] Ensure NDArray returns memory with the right memory descriptors. --- src/ndarray/ndarray.cc | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index a9dd3f0a4e28..087ea881e575 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -503,6 +503,24 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { } } +/* + * Here we want to get MKLDNN memory whose primitive desc is exactly the same as + * the given one. operator== can't guarantee that: it can return true even if + * the formats are different, so we double-check the format explicitly. + */ +static inline std::shared_ptr GetMKLDNNExact( + std::shared_ptr mem, mkldnn::memory::primitive_desc desc) { + auto src_desc = mem->get_primitive_desc(); + if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) { + return mem; + } else { + std::shared_ptr ret(new mkldnn::memory( + desc, mem->get_data_handle())); + MKLDNNStream::Instance().RegisterMem(ret); + return ret; + } +} + std::shared_ptr NDArray::GetMKLDNNData( const mkldnn::memory::primitive_desc &desc) const { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; @@ -522,9 +540,7 @@ std::shared_ptr NDArray::GetMKLDNNData( || (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2))) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); - MKLDNNStream::Instance().RegisterMem(ret); - return ret; + return GetMKLDNNExact(ptr_->Mkl_mem_, desc); } else { return nullptr; } @@ -548,7 +564,7 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( // We need to make sure Mkl_mem_ is always valid as well.
stream.RegisterMem(ptr_->Mkl_mem_); if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { - return ptr_->Mkl_mem_; + return GetMKLDNNExact(ptr_->Mkl_mem_, desc); } mkldnn::memory::primitive_desc _desc = desc; @@ -654,12 +670,12 @@ std::shared_ptr NDArray::CreateMKLDNNData( if (required_format == def_format) { ptr_->SetMKLMem(shape_, dtype_); MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return ptr_->Mkl_mem_; + return GetMKLDNNExact(ptr_->Mkl_mem_, desc); } if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return ptr_->Mkl_mem_; + return GetMKLDNNExact(ptr_->Mkl_mem_, desc); } ptr_->Mkl_mem_ = mkldnn_mem_ptr(new mkldnn::memory(desc)); From e186ee3071ed51d3a06bf1ed8f820fa6f344f76f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 20:30:50 +0000 Subject: [PATCH 170/264] use auto to define memory in the operator. --- src/operator/nn/mkldnn/mkldnn_act-inl.h | 10 +++++----- src/operator/nn/mkldnn/mkldnn_concat.cc | 4 ++-- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 17 +++++++---------- src/operator/nn/mkldnn/mkldnn_softmax.cc | 4 ++-- src/operator/nn/mkldnn/mkldnn_sum.cc | 11 +++++------ 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index 7339de81171b..e2f9989b8687 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -73,7 +73,7 @@ template void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { - std::shared_ptr input_mem = in_data.GetMKLDNNData(); + auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); auto cpu_engine = data_mpd.get_engine(); @@ -87,8 +87,8 @@ void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, alg, data_md, alpha); mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine); - std::shared_ptr output_memory - = const_cast(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc()); + auto output_memory = const_cast(out_data).CreateMKLDNNData( + pdesc.dst_primitive_desc()); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterPrim(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); stream.Submit(); @@ -102,8 +102,8 @@ void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, return; } - std::shared_ptr diff_dst_memory = out_grad.GetMKLDNNData(); - std::shared_ptr input_mem = in_data.GetMKLDNNData(); + auto diff_dst_memory = out_grad.GetMKLDNNData(); + auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc index 867311d040b0..304e01ad845b 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -40,7 +40,7 @@ void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, std::vector data_md; std::vector data_mem; for (int i =0; i < num_in_data; i++) { - std::shared_ptr tmp_mem = in_data[i].GetMKLDNNData(); + auto tmp_mem = in_data[i].GetMKLDNNData(); auto tmp_pd = tmp_mem->get_primitive_desc(); data_md.push_back(tmp_pd); 
data_mem.push_back(*tmp_mem); @@ -62,7 +62,7 @@ void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, int num_in_data = param.num_args; int axis_ = param.dim; auto engine = CpuEngine::Instance().get_engine(); - std::shared_ptr gz_mem = inputs[0].GetMKLDNNData(); + auto gz_mem = inputs[0].GetMKLDNNData(); mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); /* init the offset */ mkldnn::memory::dims offsets = {0, 0, 0, 0}; diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index fed4075b322c..cc6455efd1eb 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -122,7 +122,7 @@ inline bool MKLDNNRequireWorkspace(const PoolingParam &param) { void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam &param, const NDArray &in_data, const OpReqType &req, const NDArray &out_data, const NDArray *workspace) { - std::shared_ptr input_mem = in_data.GetMKLDNNData(); + auto input_mem = in_data.GetMKLDNNData(); auto data_mpd = input_mem->get_primitive_desc(); auto data_md = data_mpd.desc(); @@ -135,14 +135,12 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam &param, auto pdesc = GetPoolingFwd(param, ctx.is_train, data_md, out_md); - std::shared_ptr output_memory = - const_cast(out_data).CreateMKLDNNData( - pdesc.dst_primitive_desc()); - std::shared_ptr workspace_mem; + auto output_memory = const_cast(out_data).CreateMKLDNNData( + pdesc.dst_primitive_desc()); if (ctx.is_train && MKLDNNRequireWorkspace(param)) { CHECK(workspace != nullptr); - workspace_mem = workspace->GetMKLDNNData(); + auto workspace_mem = workspace->GetMKLDNNData(); MKLDNNStream::Instance().RegisterPrim( pooling_forward(pdesc, *input_mem, *output_memory, *workspace_mem)); } else { @@ -161,8 +159,8 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam &param, } TmpMemMgr::Instance().Init(ctx.requested[0]); - std::shared_ptr diff_dst_mem = out_grad.GetMKLDNNData(); - std::shared_ptr input_mem = in_data.GetMKLDNNData(); + auto diff_dst_mem = out_grad.GetMKLDNNData(); + auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1], @@ -205,11 +203,10 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam &param, auto diff_src_mem = CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req); - std::shared_ptr workspace_mem; if (MKLDNNRequireWorkspace(param)) { CHECK(workspace != nullptr); - workspace_mem = workspace->GetMKLDNNData(); + auto workspace_mem = workspace->GetMKLDNNData(); MKLDNNStream::Instance().RegisterPrim( pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), *diff_src_mem.second)); diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc index f5eff39986d7..d87e0fd2c8fe 100644 --- a/src/operator/nn/mkldnn/mkldnn_softmax.cc +++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc @@ -34,7 +34,7 @@ namespace op { void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { const SoftmaxParam& param = nnvm::get(attrs.parsed); - std::shared_ptr input_mem = in_data.GetMKLDNNData(); + auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md =
data_mpd.desc(); auto cpu_engine = data_mpd.get_engine(); @@ -44,7 +44,7 @@ void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, data_md, param.axis); mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine); - std::shared_ptr output_memory = out_data.GetMKLDNNData(); + auto output_memory = out_data.GetMKLDNNData(); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory)); stream.Submit(); diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc index 9f5c5a319c60..275640888adf 100644 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -50,20 +50,19 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const OpReqType &req, const NDArray &out_data) { - std::vector in_mems(inputs.size()); std::vector in_prims; std::vector in_pds(inputs.size()); std::vector scales(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { - in_mems[i] = inputs[i].GetMKLDNNData(); - in_prims.push_back(*in_mems[i]); - in_pds[i] = in_mems[i]->get_primitive_desc(); + auto in_mem = inputs[i].GetMKLDNNData(); + in_prims.push_back(*in_mem); + in_pds[i] = in_mem->get_primitive_desc(); scales[i] = 1; } mkldnn::sum::primitive_desc pdesc(scales, in_pds); - std::shared_ptr output_memory - = const_cast(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc()); + auto output_memory = const_cast(out_data).CreateMKLDNNData( + pdesc.dst_primitive_desc()); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterPrim(mkldnn::sum(pdesc, in_prims, *output_memory)); stream.Submit(); From b3d9c1634a8ab300e030388e8b0eeb4310a8152c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 20:33:11 +0000 Subject: [PATCH 171/264] Use raw pointer for mkldnn memory. --- include/mxnet/ndarray.h | 15 ++++-- src/ndarray/ndarray.cc | 35 +++++++------ src/operator/nn/mkldnn/mkldnn_base-inl.h | 64 ++++++++---------------- src/operator/nn/mkldnn/mkldnn_base.cc | 47 +++++++++++++++++ src/operator/nn/mkldnn/mkldnn_copy.cc | 4 +- 5 files changed, 96 insertions(+), 69 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_base.cc diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index d846e4cba38c..e5e9d753ab61 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -551,26 +551,33 @@ class NDArray { } #if MXNET_USE_MKLDNN == 1 + /* + * All functions below return a raw pointer to mkldnn memory. Actually there + * is a shared pointer that holds the memory, either in the NDArray or in the + * MKLDNN stream. As long as we call these functions inside an operator, the + * returned memory is always valid. + */ + /* * This function returns mkldnn::memory with the default primitive_desc. */ - std::shared_ptr GetMKLDNNData() const; + const mkldnn::memory *GetMKLDNNData() const; /* * This function returns mkldnn::memory with the given primitive_desc * as long as the array size meets the required size in the given primitive_desc. */ - std::shared_ptr GetMKLDNNData( + const mkldnn::memory *GetMKLDNNData( const mkldnn::memory::primitive_desc &desc) const; /* * This function returns mkldnn::memory with the given primitive_desc. * The returned mkldnn::memory will have the same physical layout as * the given primitive_desc.
*/ - std::shared_ptr GetMKLDNNDataReorder( + const mkldnn::memory *GetMKLDNNDataReorder( const mkldnn::memory::primitive_desc &desc) const; void CopyFrom(const mkldnn::memory &mem); - std::shared_ptr CreateMKLDNNData( + mkldnn::memory *CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); /* diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 087ea881e575..f9c70e7e3b8b 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -508,8 +508,8 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { * the given one. operator== can't guarantee that: it can return true even if * the formats are different, so we double-check the format explicitly. */ -static inline std::shared_ptr GetMKLDNNExact( - std::shared_ptr mem, mkldnn::memory::primitive_desc desc) { +static inline mkldnn::memory *GetMKLDNNExact( + mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) { auto src_desc = mem->get_primitive_desc(); if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) { return mem; @@ -517,11 +517,11 @@ static inline std::shared_ptr GetMKLDNNExact( std::shared_ptr ret(new mkldnn::memory( desc, mem->get_data_handle())); MKLDNNStream::Instance().RegisterMem(ret); - return ret; + return ret.get(); } } -std::shared_ptr NDArray::GetMKLDNNData( +const mkldnn::memory *NDArray::GetMKLDNNData( const mkldnn::memory::primitive_desc &desc) const { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; @@ -540,13 +540,13 @@ std::shared_ptr NDArray::GetMKLDNNData( || (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2))) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return GetMKLDNNExact(ptr_->Mkl_mem_, desc); + return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } else { return nullptr; } } -std::shared_ptr NDArray::GetMKLDNNDataReorder( +const mkldnn::memory *NDArray::GetMKLDNNDataReorder( const mkldnn::memory::primitive_desc &desc) const { if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; @@ -564,7 +564,7 @@ std::shared_ptr NDArray::GetMKLDNNDataReorder( // We need to make sure Mkl_mem_ is always valid as well.
return nullptr; @@ -612,6 +611,7 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { MKLDNNStream &stream = MKLDNNStream::Instance(); ptr_->SetMKLMem(shape_, dtype_); + stream.RegisterMem(ptr_->Mkl_mem_); auto from_desc = mem.get_primitive_desc().desc(); auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); auto from_def_format = GetDefaultFormat(from_desc); @@ -634,7 +634,7 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { // In this case, the source memory stores data in a customized layout. We // need to reorganize the data in memory before we can reshape. auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format); - mkldnn_mem_ptr def_mem = CreateMKLDNNTempMem(def_pd); + auto def_mem = TmpMemMgr::Instance().Alloc(def_pd); MKLDNNStream &stream = MKLDNNStream::Instance(); stream.RegisterPrim(mkldnn::reorder(mem, *def_mem)); // Now we can reshape it @@ -652,8 +652,7 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { } } -std::shared_ptr NDArray::CreateMKLDNNData( - const mkldnn::memory::primitive_desc &desc) { +mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) { mkldnn::memory::primitive_desc _desc = desc; auto required_format = _desc.desc().data.format; auto def_format = GetDefaultFormat(_desc.desc()); @@ -670,17 +669,17 @@ std::shared_ptr NDArray::CreateMKLDNNData( if (required_format == def_format) { ptr_->SetMKLMem(shape_, dtype_); MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return GetMKLDNNExact(ptr_->Mkl_mem_, desc); + return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return GetMKLDNNExact(ptr_->Mkl_mem_, desc); + return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } ptr_->Mkl_mem_ = mkldnn_mem_ptr(new mkldnn::memory(desc)); MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); - return ptr_->Mkl_mem_; + return ptr_->Mkl_mem_.get(); } #endif diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index dafe6c69c594..ba3c8b6923e1 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -55,6 +55,9 @@ #include #include #include "mkldnn.hpp" +#include "mxnet/ndarray.h" +#include "mxnet/resource.h" +#include "mxnet/op_attr_types.h" using namespace mkldnn; namespace mxnet { extern bool EnableMkldnnWarnGenerated(); @@ -234,31 +237,13 @@ class TmpMemMgr { this->est_size = 0; } - mkldnn_mem_ptr Alloc(const mkldnn::memory::primitive_desc &pd) { - // We need to include the size of the memory used for alignment. - this->est_size += pd.get_size() + alignment; - void *this_mem = this->curr_mem; - void *mem = std::align(alignment, pd.get_size(), this_mem, this->curr_size); - if (mem) { - // The memory is allocated from the temporary memory space in the - // operator. It'll only become invalid after we exit from the operator. - mkldnn_mem_ptr ret(new mkldnn::memory(pd, this_mem)); - CHECK_EQ(this_mem, mem); - this->curr_size -= pd.get_size(); - this->curr_mem = static_cast(this_mem) + pd.get_size(); - return ret; - } else { - LOG(WARNING) << "Allocate " << pd.get_size() - << " bytes with malloc directly"; - return mkldnn_mem_ptr(new mkldnn::memory(pd)); - } - } + mkldnn::memory *Alloc(const mkldnn::memory::primitive_desc &pd); }; class MKLDNNStream { std::vector net; // Here we hold all memory related to the operators in the stream. 
- std::vector mem_holder; + std::vector > mem_holder; public: static MKLDNNStream &Instance() { @@ -268,7 +253,9 @@ class MKLDNNStream { void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); } - void RegisterMem(mkldnn_mem_const_ptr mem) { mem_holder.push_back(mem); } + void RegisterMem(std::shared_ptr mem) { + mem_holder.push_back(mem); + } void Submit() { if (!net.empty()) @@ -279,33 +266,24 @@ class MKLDNNStream { } }; -inline static mkldnn_mem_ptr CreateMKLDNNTempMem( - const mkldnn::memory::primitive_desc &desc) { - mkldnn_mem_ptr ret = TmpMemMgr::Instance().Alloc(desc); - MKLDNNStream::Instance().RegisterMem(ret); - return ret; -} - enum OutDataOp { Noop, CopyBack, AddBack, }; -typedef std::pair mkldnn_output_t; +typedef std::pair mkldnn_output_t; static inline mkldnn_output_t CreateMKLDNNMem( const NDArray &arr, const mkldnn::memory::primitive_desc &desc, OpReqType req) { if (kAddTo == req) { auto tmp = TmpMemMgr::Instance().Alloc(desc); - MKLDNNStream::Instance().RegisterMem(tmp); return mkldnn_output_t(OutDataOp::AddBack, tmp); } else { - mkldnn_mem_ptr mem = const_cast(arr).CreateMKLDNNData(desc); + mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc); if (mem == nullptr) { auto tmp = TmpMemMgr::Instance().Alloc(desc); - MKLDNNStream::Instance().RegisterMem(tmp); return mkldnn_output_t(OutDataOp::CopyBack, tmp); } else { return mkldnn_output_t(OutDataOp::Noop, mem); @@ -323,22 +301,20 @@ static inline void CommitOutput(const NDArray &arr, if (res.first == CopyBack) { const_cast(arr).CopyFrom(*res.second); } else if (res.first == AddBack) { - mkldnn_mem_const_ptr mem = - arr.GetMKLDNNData(res.second->get_primitive_desc()); + auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); CHECK(mem != nullptr); // We have to allocate new memory for the sum result. 
- mkldnn_mem_ptr sum_res = TmpMemMgr::Instance().Alloc( + auto sum_res = TmpMemMgr::Instance().Alloc( res.second->get_primitive_desc()); - MKLDNNStream::Instance().RegisterMem(sum_res); op::Sum(*res.second, *mem, *sum_res); const_cast(arr).CopyFrom(*sum_res); } } -inline static mkldnn_mem_const_ptr GetWeights( - const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, - int num_groups) { - mkldnn_mem_const_ptr mem; +inline static const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, + int num_groups) { + const mkldnn::memory *mem; mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Instance().get_engine(); if (arr.shape().ndim() == 2) { @@ -375,14 +351,14 @@ inline static mkldnn_mem_const_ptr GetWeights( } if (mem->get_primitive_desc() == target_pd) return mem; - std::shared_ptr ret = CreateMKLDNNTempMem(target_pd); + auto ret = TmpMemMgr::Instance().Alloc(target_pd); MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); return ret; } -inline static mkldnn_mem_const_ptr GetWeights(const NDArray &arr, - const mkldnn::engine &engine, - int num_groups = 1) { +inline static const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::engine &engine, + int num_groups = 1) { mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); if (arr.shape().ndim() == 2) { mkldnn::memory::dims tz = mkldnn::memory::dims{ diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc new file mode 100644 index 000000000000..8b4a8e7fa6ad --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "./mkldnn_base-inl.h" + +namespace mxnet { + +mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) { + // We need to include the size of the memory used for alignment. + this->est_size += pd.get_size() + alignment; + void *this_mem = this->curr_mem; + void *mem = std::align(alignment, pd.get_size(), this_mem, this->curr_size); + if (mem) { + // The memory is allocated from the temporary memory space in the + // operator. It'll only become invalid after we exit from the operator. 
+ mkldnn_mem_ptr ret(new mkldnn::memory(pd, this_mem)); + MKLDNNStream::Instance().RegisterMem(ret); + CHECK_EQ(this_mem, mem); + this->curr_size -= pd.get_size(); + this->curr_mem = static_cast(this_mem) + pd.get_size(); + return ret.get(); + } else { + LOG(WARNING) << "Allocate " << pd.get_size() + << " bytes with malloc directly"; + mkldnn_mem_ptr ret(new mkldnn::memory(pd)); + MKLDNNStream::Instance().RegisterMem(ret); + return ret.get(); + } +} + +} // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc index d15fd39ddf7a..5c9c23eb8a62 100644 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ b/src/operator/nn/mkldnn/mkldnn_copy.cc @@ -42,9 +42,7 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc()); if (out_mem == nullptr) out_mem = out_data.GetMKLDNNData(); - mkldnn_mem_ptr sum_res - = TmpMemMgr::Instance().Alloc(out_mem->get_primitive_desc()); - MKLDNNStream::Instance().RegisterMem(sum_res); + auto sum_res = TmpMemMgr::Instance().Alloc(out_mem->get_primitive_desc()); Sum(*in_mem, *out_mem, *sum_res); const_cast(out_data).CopyFrom(*sum_res); } else { From c099520aa42e88d1ae25e51677cc87d171a77035 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 21:56:30 +0000 Subject: [PATCH 172/264] Move more code to mkldnn_base.cc --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 129 ++--------------------- src/operator/nn/mkldnn/mkldnn_base.cc | 115 ++++++++++++++++++++ 2 files changed, 125 insertions(+), 119 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index ba3c8b6923e1..bb00f300d9da 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -274,125 +274,16 @@ enum OutDataOp { typedef std::pair mkldnn_output_t; -static inline mkldnn_output_t CreateMKLDNNMem( - const NDArray &arr, const mkldnn::memory::primitive_desc &desc, - OpReqType req) { - if (kAddTo == req) { - auto tmp = TmpMemMgr::Instance().Alloc(desc); - return mkldnn_output_t(OutDataOp::AddBack, tmp); - } else { - mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc); - if (mem == nullptr) { - auto tmp = TmpMemMgr::Instance().Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); - } else { - return mkldnn_output_t(OutDataOp::Noop, mem); - } - } -} - -namespace op { -void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, - const mkldnn::memory &out); -} - -static inline void CommitOutput(const NDArray &arr, - const mkldnn_output_t &res) { - if (res.first == CopyBack) { - const_cast(arr).CopyFrom(*res.second); - } else if (res.first == AddBack) { - auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); - CHECK(mem != nullptr); - // We have to allocate new memory for the sum result. 
- auto sum_res = TmpMemMgr::Instance().Alloc( - res.second->get_primitive_desc()); - op::Sum(*res.second, *mem, *sum_res); - const_cast(arr).CopyFrom(*sum_res); - } -} - -inline static const mkldnn::memory *GetWeights(const NDArray &arr, - const mkldnn::memory::primitive_desc &target_pd, - int num_groups) { - const mkldnn::memory *mem; - mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); - auto engine = CpuEngine::Instance().get_engine(); - if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, - static_cast(arr.shape()[0] / num_groups), - static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), - static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - } else { - LOG(FATAL) << "The weight array has an unsupported number of dimensions"; - return nullptr; - } - if (mem->get_primitive_desc() == target_pd) return mem; - - auto ret = TmpMemMgr::Instance().Alloc(target_pd); - MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); - return ret; -} - -inline static const mkldnn::memory *GetWeights(const NDArray &arr, - const mkldnn::engine &engine, - int num_groups = 1) { - mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); - if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - return arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - return arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, - static_cast(arr.shape()[0] / num_groups), - static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), - static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - return arr.GetMKLDNNData(pd); - } else { - LOG(FATAL) << "The weight array has an unsupported number of dimensions"; - return nullptr; - } -} +mkldnn_output_t 
CreateMKLDNNMem(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req); +void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, + int num_groups); +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::engine &engine, + int num_groups = 1); } // namespace mxnet #endif diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 8b4a8e7fa6ad..ed9e0cadd691 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -18,6 +18,7 @@ */ #include "./mkldnn_base-inl.h" +#include "./mkldnn_ops-inl.h" namespace mxnet { @@ -44,4 +45,118 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) { } } +mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req) { + if (kAddTo == req) { + auto tmp = TmpMemMgr::Instance().Alloc(desc); + return mkldnn_output_t(OutDataOp::AddBack, tmp); + } else { + mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc); + if (mem == nullptr) { + auto tmp = TmpMemMgr::Instance().Alloc(desc); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { + return mkldnn_output_t(OutDataOp::Noop, mem); + } + } +} + +void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { + if (res.first == CopyBack) { + const_cast(arr).CopyFrom(*res.second); + } else if (res.first == AddBack) { + auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); + CHECK(mem != nullptr); + // We have to allocate new memory for the sum result. + auto sum_res = TmpMemMgr::Instance().Alloc( + res.second->get_primitive_desc()); + op::Sum(*res.second, *mem, *sum_res); + const_cast(arr).CopyFrom(*sum_res); + } +} + +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, + int num_groups) { + const mkldnn::memory *mem; + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); + auto engine = CpuEngine::Instance().get_engine(); + if (arr.shape().ndim() == 2) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast(arr.shape()[0] / num_groups), + static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), + static_cast(arr.shape()[3])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } else { + LOG(FATAL) << "The weight array has an unsupported number of dimensions"; + return nullptr; + } + if (mem->get_primitive_desc() == target_pd) return mem; + + auto ret = 
TmpMemMgr::Instance().Alloc(target_pd); + MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; +} + +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::engine &engine, + int num_groups) { + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); + if (arr.shape().ndim() == 2) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); + } else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); + } else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast(arr.shape()[0] / num_groups), + static_cast(arr.shape()[1]), + static_cast(arr.shape()[2]), + static_cast(arr.shape()[3])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); + } else { + LOG(FATAL) << "The weight array has an unsupported number of dimensions"; + return nullptr; + } +} + } // namespace mxnet From 04ec94523e04b9702bfc3ace14efbc753a5a3b63 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 12 Dec 2017 22:28:47 +0000 Subject: [PATCH 173/264] Fix a compilation error. --- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 8934e963d578..1b8733b3aa17 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -86,6 +86,9 @@ void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs); +void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, + const mkldnn::memory &out); + } // namespace op } // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 From 9562d9669e679fbec52470444accb830157d6f4f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 00:28:41 +0000 Subject: [PATCH 174/264] Address review comments. 
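The review feedback is mostly about naming and accessor conventions: singleton accessors change from Instance(), which returned a reference, to Get(), which returns a pointer (MKLDNNStream::Get()->..., TmpMemMgr::Get()->..., CpuEngine::Get()->...); underscore-prefixed helpers such as _ActivationCompute become ActivationComputeImpl; backward_XStorageType becomes BackwardXStorageType; and the DoBNForward/DoBNBackward pair becomes BatchNormForwardImpl/BatchNormBackwardImpl. Below is a minimal sketch of the accessor shape the renamed call sites assume; the class body here is a placeholder for illustration, not the real one from mkldnn_base-inl.h:

    class MKLDNNStream {
     public:
      static MKLDNNStream *Get() {
        // One stream per thread; returning a pointer lets call sites chain
        // MKLDNNStream::Get()->Submit() without binding a reference first.
        static thread_local MKLDNNStream stream;
        return &stream;
      }
      void Submit() {}  // placeholder for the real submit logic
    };

    int main() {
      MKLDNNStream::Get()->Submit();
      return 0;
    }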
--- src/ndarray/ndarray.cc | 53 +++++------ src/operator/nn/activation-inl.h | 14 +-- src/operator/nn/activation.cc | 24 ++--- src/operator/nn/batch_norm-inl.h | 94 ++++++++++--------- src/operator/nn/batch_norm.cc | 28 +++--- src/operator/nn/batch_norm.cu | 36 +++---- src/operator/nn/concat.cc | 12 +-- src/operator/nn/convolution.cc | 12 +-- src/operator/nn/cudnn/cudnn_activation-inl.h | 2 +- src/operator/nn/deconvolution.cc | 20 ++-- src/operator/nn/fully_connected.cc | 12 +-- src/operator/nn/mkldnn/mkldnn_act-inl.h | 32 ++++--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 14 +-- src/operator/nn/mkldnn/mkldnn_base.cc | 16 ++-- src/operator/nn/mkldnn/mkldnn_concat.cc | 16 ++-- src/operator/nn/mkldnn/mkldnn_convolution.cc | 28 +++--- src/operator/nn/mkldnn/mkldnn_copy.cc | 6 +- .../nn/mkldnn/mkldnn_deconvolution.cc | 20 ++-- .../nn/mkldnn/mkldnn_fully_connected.cc | 24 ++--- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 16 ++-- src/operator/nn/mkldnn/mkldnn_softmax.cc | 6 +- src/operator/nn/mkldnn/mkldnn_sum.cc | 13 +-- src/operator/nn/pooling.cc | 12 +-- 23 files changed, 257 insertions(+), 253 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f9c70e7e3b8b..2dcfa230602f 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -242,12 +242,12 @@ static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, auto def_pd = GetPrimitiveDesc(mem->get_primitive_desc(), format); mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd)); - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterMem(mem); - stream.RegisterMem(def_mem); - stream.RegisterPrim(mkldnn::reorder(*mem, *def_mem)); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterMem(mem); + stream->RegisterMem(def_mem); + stream->RegisterPrim(mkldnn::reorder(*mem, *def_mem)); if (submit_now) - stream.Submit(); + stream->Submit(); return def_mem; } @@ -488,7 +488,7 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { case 5: layout = mkldnn::memory::format::goihw; break; } mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; - auto cpu_engine = CpuEngine::Instance().get_engine(); + auto cpu_engine = CpuEngine::Get()->get_engine(); // If the storage type is the default type, we can just simply // reference to the memory for the default storage. if (storage_type == kDefaultStorage) { @@ -516,7 +516,7 @@ static inline mkldnn::memory *GetMKLDNNExact( } else { std::shared_ptr ret(new mkldnn::memory( desc, mem->get_data_handle())); - MKLDNNStream::Instance().RegisterMem(ret); + MKLDNNStream::Get()->RegisterMem(ret); return ret.get(); } } @@ -539,7 +539,7 @@ const mkldnn::memory *NDArray::GetMKLDNNData( if (ptr_->Mkl_mem_->get_primitive_desc() == desc || (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2))) { - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } else { return nullptr; @@ -560,9 +560,9 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( // have been initialized since we are trying to get data from the array. CHECK(ptr_->Mkl_mem_ != nullptr); // If the memory descriptor matches, it's easy. - MKLDNNStream &stream = MKLDNNStream::Instance(); + MKLDNNStream *stream = MKLDNNStream::Get(); // We need to make sure Mkl_mem_ is always valid as well. 
- stream.RegisterMem(ptr_->Mkl_mem_); + stream->RegisterMem(ptr_->Mkl_mem_); if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } @@ -575,11 +575,11 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( if (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2)) { mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); - stream.RegisterMem(ret); + stream->RegisterMem(ret); return ret.get(); } else { - auto ret = TmpMemMgr::Instance().Alloc(desc); - stream.RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); + auto ret = TmpMemMgr::Get()->Alloc(desc); + stream->RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); return ret; } } @@ -588,7 +588,7 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const { CHECK(storage_type() == kMKLDNNStorage || storage_type() == kDefaultStorage); ptr_->SetMKLMem(shape_, dtype_); if (ptr_->Mkl_mem_) { - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_.get(); } else { // We don't support converting sparse format. @@ -609,9 +609,9 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { return; } - MKLDNNStream &stream = MKLDNNStream::Instance(); + MKLDNNStream *stream = MKLDNNStream::Get(); ptr_->SetMKLMem(shape_, dtype_); - stream.RegisterMem(ptr_->Mkl_mem_); + stream->RegisterMem(ptr_->Mkl_mem_); auto from_desc = mem.get_primitive_desc().desc(); auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); auto from_def_format = GetDefaultFormat(from_desc); @@ -628,15 +628,14 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { mkldnn::memory::desc data_md(dims, this_dtype, this_format); mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); - stream.RegisterMem(tmp_mem); - stream.RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); + stream->RegisterMem(tmp_mem); + stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { // In this case, the source memory stores data in a customized layout. We // need to reorganize the data in memory before we can reshape. 
auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format); - auto def_mem = TmpMemMgr::Instance().Alloc(def_pd); - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterPrim(mkldnn::reorder(mem, *def_mem)); + auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); + stream->RegisterPrim(mkldnn::reorder(mem, *def_mem)); // Now we can reshape it mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims); @@ -645,10 +644,10 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { mkldnn::memory::desc data_md(dims, this_dtype, this_format); mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle())); - stream.RegisterMem(tmp_mem); - stream.RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); + stream->RegisterMem(tmp_mem); + stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_)); } else { - stream.RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); + stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_)); } } @@ -668,17 +667,17 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc & // If the shape isn't the same, it actually implicitly reshapes data. if (required_format == def_format) { ptr_->SetMKLMem(shape_, dtype_); - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } ptr_->Mkl_mem_ = mkldnn_mem_ptr(new mkldnn::memory(desc)); - MKLDNNStream::Instance().RegisterMem(ptr_->Mkl_mem_); + MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_.get(); } #endif diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index 9b82c83ca0e8..a1e2423ac0df 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -101,8 +101,8 @@ void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, } template -void _ActivationCompute(const ActivationParam ¶m, const OpContext &ctx, - const TBlob &input, OpReqType req, const TBlob &output) { +void ActivationComputeImpl(const ActivationParam ¶m, const OpContext &ctx, + const TBlob &input, OpReqType req, const TBlob &output) { MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, { switch (param.act_type) { case activation::kReLU: @@ -128,9 +128,9 @@ void _ActivationCompute(const ActivationParam ¶m, const OpContext &ctx, } template -void _ActivationGradCompute(const ActivationParam ¶m, const OpContext &ctx, - const TBlob &out_grad, const TBlob &out_data, - OpReqType req, const TBlob &output) { +void ActivationGradComputeImpl(const ActivationParam ¶m, const OpContext &ctx, + const TBlob &out_grad, const TBlob &out_data, + OpReqType req, const TBlob &output) { MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { switch (param.act_type) { case activation::kReLU: @@ -164,7 +164,7 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); - _ActivationCompute(param, ctx, inputs[0], req[0], outputs[0]); + ActivationComputeImpl(param, ctx, inputs[0], req[0], outputs[0]); } template @@ -181,7 +181,7 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs, 
CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); - _ActivationGradCompute(param, ctx, inputs[0], inputs[1], req[0], outputs[0]); + ActivationGradComputeImpl(param, ctx, inputs[0], inputs[1], req[0], outputs[0]); } } // namespace op diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 70fe88bc8753..876756cd7ec4 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -60,11 +60,11 @@ static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNAct_Forward(ctx, param, inputs[0], req[0], outputs[0]); + MKLDNNActivationForward(ctx, param, inputs[0], req[0], outputs[0]); return; } #endif - _ActivationCompute(param, ctx, inputs[0].data(), req[0], outputs[0].data()); + ActivationComputeImpl(param, ctx, inputs[0].data(), req[0], outputs[0].data()); } void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, @@ -80,13 +80,13 @@ void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNAct_Backward(ctx, param, inputs[0], inputs[1], req[0], - outputs[0]); + MKLDNNActivationBackward(ctx, param, inputs[0], inputs[1], req[0], + outputs[0]); return; } #endif - _ActivationGradCompute(param, ctx, inputs[0].data(), inputs[1].data(), - req[0], outputs[0].data()); + ActivationGradComputeImpl(param, ctx, inputs[0].data(), inputs[1].data(), + req[0], outputs[0].data()); } inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, @@ -111,11 +111,11 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, dispatch_mode, in_attrs, out_attrs); } -inline static bool backward_ActStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { +inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { #if MXNET_USE_CUDNN == 1 CHECK_EQ(in_attrs->size(), 3U); #else @@ -164,7 +164,7 @@ NNVM_REGISTER_OP(_backward_Activation) .set_num_inputs(3) .set_num_outputs(1) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", backward_ActStorageType) +.set_attr("FInferStorageType", BackwardActStorageType) .set_attr("FInferShape", ElemwiseShape<3, 1>) .set_attr("FInferType", ElemwiseType<3, 1>) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index a6b11fc647f6..22234dec0699 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -91,40 +91,40 @@ static inline bool IsBNWriting(const OpReqType ort) { } template -void DoBNForward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states); +void BatchNormForwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states); template -void DoBNBackward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector 
&out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states); +void BatchNormBackwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states); #if MXNET_USE_CUDA template -void DoBNForward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states); +void BatchNormForwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states); template -void DoBNBackward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states); +void BatchNormBackwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states); #endif // MXNET_USE_CUDA /*! @@ -139,11 +139,11 @@ void DoBNBackward(mshadow::Stream *stream, * \sa OpReqType, OpContext */ template -void BNForward(const OpContext &ctx, const BatchNormParam& param, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormForward(const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { using namespace mshadow; using namespace mshadow::expr; @@ -158,7 +158,8 @@ void BNForward(const OpContext &ctx, const BatchNormParam& param, CHECK_EQ(req[batchnorm::kOut], kWriteTo); } Stream *s = ctx.get_stream(); - DoBNForward(s, ctx, param, in_data, req, out_data, aux_states); + BatchNormForwardImpl(s, ctx, param, in_data, req, + out_data, aux_states); } /*! @@ -190,20 +191,20 @@ void BNForward(const OpContext &ctx, const BatchNormParam& param, * \sa OperatorProperty, OpReqType, OpContext */ template -void BNBackward(const OpContext &ctx, const BatchNormParam& param, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { CHECK_EQ(out_grad.size(), param.output_mean_var ? 
3U : 1U); CHECK_EQ(in_data.size(), 3U); CHECK_EQ(out_data.size(), 3U); CHECK_EQ(in_grad.size(), 3U); mshadow::Stream *s = ctx.get_stream(); - DoBNBackward(s, ctx, param, out_grad, in_data, - out_data, req, in_grad, aux_states); + BatchNormBackwardImpl(s, ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); } template @@ -218,7 +219,8 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { - BNForward(ctx, param, in_data, req, outputs, aux_states); + BatchNormForward(ctx, param, in_data, req, outputs, + aux_states); }); } @@ -242,8 +244,8 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, std::vector in_grad(outputs.begin(), outputs.begin() + 3); MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { - BNBackward(ctx, param, out_grad, in_data, out_data, req, - in_grad, aux_states); + BatchNormBackward(ctx, param, out_grad, in_data, out_data, req, + in_grad, aux_states); }); } diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index bb5a70658d21..04970da68373 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -90,12 +90,12 @@ static inline void ForEachFast(const BNTensor3 &in_data, /*! \brief Forward CPU */ template -void DoBNForward(mshadow::Stream *, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormForwardImpl(mshadow::Stream *, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { // Input batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -190,14 +190,14 @@ void DoBNForward(mshadow::Stream *, } template -void DoBNBackward(mshadow::Stream *, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormBackwardImpl(mshadow::Stream *, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { // Input Data batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 682c286f4a3a..80c15976b65f 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -593,12 +593,12 @@ static inline uint32_t SetupFlags(const OpContext &ctx, /*! 
\brief Forward batch-norm pass on GPU */ template -void DoBNForward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormForwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationUpdateOutput( stream, ctx, @@ -614,14 +614,14 @@ void DoBNForward(mshadow::Stream *stream, /*! \brief Backward batch-norm pass on GPU */ template -void DoBNBackward(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormBackwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationBackward( stream, ctx, @@ -671,12 +671,12 @@ void BatchNormCompute(const nnvm::NodeAttrs& attrs, }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - BNForward(ctx, param, in_data, req, outputs, aux_states); + BatchNormForward(ctx, param, in_data, req, outputs, aux_states); }) } #else MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { - BNForward(ctx, param, in_data, req, outputs, aux_states); + BatchNormForward(ctx, param, in_data, req, outputs, aux_states); }); #endif } @@ -706,13 +706,13 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - BNBackward(ctx, param, out_grad, + BatchNormBackward(ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states); }) } #else MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { - BNBackward(ctx, param, out_grad, + BatchNormBackward(ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states); }); #endif diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 7de5706bf956..0828c0f037c9 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -126,11 +126,11 @@ inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, return true; } -inline static bool backward_ConcatStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { +inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { #if MXNET_USE_MKLDNN == 1 CHECK_EQ(out_attrs->size(), in_attrs->size() - 1); if (dev_mask == mshadow::cpu::kDevMask @@ -287,7 +287,7 @@ NNVM_REGISTER_OP(_backward_Concat) }) #endif .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", backward_ConcatStorageType) +.set_attr("FInferStorageType", BackwardConcatStorageType) .set_attr("FCompute", ConcatGradCompute) .set_attr("FComputeEx", ConcatGradComputeExCPU); diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index e354a4536103..d50b9747eff0 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -319,11 
+319,11 @@ inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, return true; } -inline static bool backward_ConvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { +inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); uint32_t in_expected = param.no_bias ? 3 : 4; uint32_t out_expected = param.no_bias ? 2 : 3; @@ -515,7 +515,7 @@ NNVM_REGISTER_OP(_backward_Convolution) return params.no_bias ? 2 : 3; }) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", backward_ConvStorageType) +.set_attr("FInferStorageType", BackwardConvStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h index 35827917c7d5..a89e7bfaf080 100644 --- a/src/operator/nn/cudnn/cudnn_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_activation-inl.h @@ -41,6 +41,7 @@ class CuDNNActivationOp { nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); #endif + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); } void Init(const ActivationParam ¶m) { @@ -62,7 +63,6 @@ class CuDNNActivationOp { #if CUDNN_MAJOR >= 5 CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_)); #endif - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); } ~CuDNNActivationOp() { diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index ac241717d1d9..086f359931b2 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -257,10 +257,10 @@ static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, } inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); uint32_t in_expected = param.no_bias ? 2 : 3; CHECK_EQ(in_attrs->size(), in_expected); @@ -283,11 +283,11 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, return true; } -inline static bool backward_DeconvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { +inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const DeconvolutionParam& param = nnvm::get(attrs.parsed); uint32_t out_expected = param.no_bias ? 2 : 3; CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U); @@ -434,7 +434,7 @@ NNVM_REGISTER_OP(_backward_Deconvolution) return params.no_bias ? 
2 : 3; }) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", backward_DeconvStorageType) +.set_attr("FInferStorageType", BackwardDeconvStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 805ffbea2ba0..1770c459b1c8 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -152,11 +152,11 @@ inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, return true; } -inline static bool backward_FCStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { +inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const FullyConnectedParam& param = nnvm::get(attrs.parsed); uint32_t out_expected = param.no_bias ? 2 : 3; CHECK_EQ(in_attrs->size(), 3U); @@ -254,7 +254,7 @@ NNVM_REGISTER_OP(_backward_FullyConnected) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{1, 0}}; }) -.set_attr("FInferStorageType", backward_FCStorageType) +.set_attr("FInferStorageType", BackwardFCStorageType) .set_attr_parser(ParamParser) .set_attr("FCompute", FullyConnectedGradCompute) .set_attr("FComputeEx", FullyConnectedGradCompute_CPU); diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index e2f9989b8687..46711f87a796 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -70,9 +70,9 @@ static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { } template -void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { +void MKLDNNActivationForward(const OpContext &ctx, const ActivationParam& param, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); @@ -89,15 +89,15 @@ void MKLDNNAct_Forward(const OpContext &ctx, const ActivationParam& param, auto output_memory = const_cast(out_data).CreateMKLDNNData( pdesc.dst_primitive_desc()); - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterPrim(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); - stream.Submit(); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); + stream->Submit(); } template -void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, - const NDArray &out_grad, const NDArray &in_data, - const OpReqType &req, const NDArray &in_grad) { +void MKLDNNActivationBackward(const OpContext &ctx, const ActivationParam& param, + const NDArray &out_grad, const NDArray &in_data, + const OpReqType &req, const NDArray &in_grad) { if (req == kNullOp) { return; } @@ -109,7 +109,7 @@ void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); auto cpu_engine = data_mpd.get_engine(); Dtype alpha = 0; - TmpMemMgr::Instance().Init(ctx.requested[activation::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]); auto alg = 
GetMKLDNNActAlgo(param); mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training, @@ -118,12 +118,14 @@ void MKLDNNAct_Backward(const OpContext &ctx, const ActivationParam& param, mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha); mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); - auto diff_src_memory = CreateMKLDNNMem(in_grad, bw_pdesc.diff_src_primitive_desc(), req); - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, - *diff_dst_memory, *diff_src_memory.second)); + auto diff_src_memory = CreateMKLDNNMem(in_grad, + bw_pdesc.diff_src_primitive_desc(), req); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, + *diff_dst_memory, + *diff_src_memory.second)); CommitOutput(in_grad, diff_src_memory); - stream.Submit(); + stream->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index bb00f300d9da..a4adb275e7f7 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -65,10 +65,10 @@ extern bool EnableMkldnnWarnGenerated(); // cpu_engine singleton class CpuEngine { public: - static CpuEngine &Instance() { + static CpuEngine *Get() { // I's thread-safe in C++11. static thread_local CpuEngine myInstance; - return myInstance; + return &myInstance; } CpuEngine(CpuEngine const &) = delete; // Copy construct CpuEngine(CpuEngine &&) = delete; // Move construct @@ -204,9 +204,9 @@ class TmpMemMgr { const size_t alignment = 4096; public: - static TmpMemMgr &Instance() { + static TmpMemMgr *Get() { static thread_local TmpMemMgr mgr; - return mgr; + return &mgr; } TmpMemMgr() { @@ -246,9 +246,9 @@ class MKLDNNStream { std::vector > mem_holder; public: - static MKLDNNStream &Instance() { + static MKLDNNStream *Get() { static thread_local MKLDNNStream stream; - return stream; + return &stream; } void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); } @@ -262,7 +262,7 @@ class MKLDNNStream { mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); net.clear(); mem_holder.clear(); - TmpMemMgr::Instance().Reset(); + TmpMemMgr::Get()->Reset(); } }; diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index ed9e0cadd691..ff77c3a1d221 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -31,7 +31,7 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) { // The memory is allocated from the temporary memory space in the // operator. It'll only become invalid after we exit from the operator. 
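  // In other words, Alloc() behaves as a bump-pointer allocator over the
  // operator's temp workspace: each successful request carves pd.get_size()
  // bytes off curr_mem and shrinks curr_size, and only when the workspace is
  // exhausted does the code below fall back to a plain allocation, logged
  // with a warning.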
mkldnn_mem_ptr ret(new mkldnn::memory(pd, this_mem)); - MKLDNNStream::Instance().RegisterMem(ret); + MKLDNNStream::Get()->RegisterMem(ret); CHECK_EQ(this_mem, mem); this->curr_size -= pd.get_size(); this->curr_mem = static_cast(this_mem) + pd.get_size(); @@ -40,7 +40,7 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) { LOG(WARNING) << "Allocate " << pd.get_size() << " bytes with malloc directly"; mkldnn_mem_ptr ret(new mkldnn::memory(pd)); - MKLDNNStream::Instance().RegisterMem(ret); + MKLDNNStream::Get()->RegisterMem(ret); return ret.get(); } } @@ -49,12 +49,12 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, const mkldnn::memory::primitive_desc &desc, OpReqType req) { if (kAddTo == req) { - auto tmp = TmpMemMgr::Instance().Alloc(desc); + auto tmp = TmpMemMgr::Get()->Alloc(desc); return mkldnn_output_t(OutDataOp::AddBack, tmp); } else { mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc); if (mem == nullptr) { - auto tmp = TmpMemMgr::Instance().Alloc(desc); + auto tmp = TmpMemMgr::Get()->Alloc(desc); return mkldnn_output_t(OutDataOp::CopyBack, tmp); } else { return mkldnn_output_t(OutDataOp::Noop, mem); @@ -69,7 +69,7 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); CHECK(mem != nullptr); // We have to allocate new memory for the sum result. - auto sum_res = TmpMemMgr::Instance().Alloc( + auto sum_res = TmpMemMgr::Get()->Alloc( res.second->get_primitive_desc()); op::Sum(*res.second, *mem, *sum_res); const_cast(arr).CopyFrom(*sum_res); @@ -81,7 +81,7 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) { const mkldnn::memory *mem; mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); if (arr.shape().ndim() == 2) { mkldnn::memory::dims tz = mkldnn::memory::dims{ static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; @@ -116,8 +116,8 @@ const mkldnn::memory *GetWeights(const NDArray &arr, } if (mem->get_primitive_desc() == target_pd) return mem; - auto ret = TmpMemMgr::Instance().Alloc(target_pd); - MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder(*mem, *ret)); + auto ret = TmpMemMgr::Get()->Alloc(target_pd); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); return ret; } diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc index 304e01ad845b..56fb473b4fe0 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -33,7 +33,7 @@ namespace op { void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { - TmpMemMgr::Instance().Init(ctx.requested[concat_enum::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); const ConcatParam& param = nnvm::get(attrs.parsed); int num_in_data = param.num_args; int concat_dim = param.dim; @@ -46,22 +46,22 @@ void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, data_mem.push_back(*tmp_mem); } mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut], fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::concat(fwd_pd, data_mem, 
*out_mem.second)); + MKLDNNStream::Get()->RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second)); CommitOutput(out_data[concat_enum::kOut], out_mem); - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - TmpMemMgr::Instance().Init(ctx.requested[concat_enum::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); const ConcatParam& param = nnvm::get(attrs.parsed); int num_in_data = param.num_args; int axis_ = param.dim; - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); auto gz_mem = inputs[0].GetMKLDNNData(); mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); /* init the offset */ @@ -81,11 +81,11 @@ void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, mkldnn::reorder::primitive_desc reorder_pd( view_pd.get()->dst_primitive_desc(), diff_src_mpd); offsets[axis_] += diff_src_tz[axis_]; - MKLDNNStream::Instance().RegisterPrim(mkldnn::reorder( + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder( reorder_pd, *gz_mem, *gradi_mem_.second)); CommitOutput(outputs[i], gradi_mem_); } - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 56d4bd557bcb..c2b01c61da5c 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -38,7 +38,7 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwd( auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -87,7 +87,7 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -122,7 +122,7 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -169,36 +169,36 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct const std::vector &in_data, const std::vector &req, const std::vector &out_data) { - TmpMemMgr::Instance().Init(ctx.requested[conv::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); const ConvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], param.no_bias ? 
nullptr : &in_data[conv::kBias], out_data[conv::kOut]); auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd_pd.src_primitive_desc()); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); auto weight_mem = GetWeights(in_data[conv::kWeight], fwd_pd.weights_primitive_desc(), param.num_group); auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd_pd.dst_primitive_desc(), req[conv::kOut]); if (param.no_bias) { - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, *out_mem.second)); } else { auto bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd_pd.bias_primitive_desc()); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(fwd_pd, + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, *bias_mem, *out_mem.second)); } CommitOutput(out_data[conv::kOut], out_mem); - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - TmpMemMgr::Instance().Init(ctx.requested[conv::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); const std::vector &in_grad = outputs; - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); const ConvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, inputs[conv::kData + 1], inputs[conv::kWeight + 1], @@ -215,7 +215,7 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c bwdData_pd.weights_primitive_desc(), param.num_group); auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], bwdData_pd.diff_src_primitive_desc(), req[conv::kData]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); CommitOutput(in_grad[conv::kData], in_grad_mem); } @@ -233,20 +233,20 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c req[conv::kWeight]); mkldnn_output_t in_grad_bias; if (param.no_bias) { - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); } else { in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias], bwdWeights_pd.diff_bias_primitive_desc(), req[conv::kBias]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, *in_grad_bias.second)); } CommitOutput(in_grad[conv::kWeight], in_grad_weight); CommitOutput(in_grad[conv::kBias], in_grad_bias); } - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc index 5c9c23eb8a62..19350ed69290 100644 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ b/src/operator/nn/mkldnn/mkldnn_copy.cc @@ -36,19 +36,19 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &out_data) { auto in_mem = 
in_data.GetMKLDNNData(); if (req == kAddTo) { - TmpMemMgr::Instance().Init(ctx.requested[0]); + TmpMemMgr::Get()->Init(ctx.requested[0]); // We should try and force the output memory has the same format // as the input memory. If not, we'll have to reorder memory. auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc()); if (out_mem == nullptr) out_mem = out_data.GetMKLDNNData(); - auto sum_res = TmpMemMgr::Instance().Alloc(out_mem->get_primitive_desc()); + auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc()); Sum(*in_mem, *out_mem, *sum_res); const_cast(out_data).CopyFrom(*sum_res); } else { const_cast(out_data).CopyFrom(*in_mem); } - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 251f341d32da..d0605e3d448b 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -65,7 +65,7 @@ static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwd( auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -95,7 +95,7 @@ static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -122,7 +122,7 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( auto data_md = GetMemDesc(data); auto weight_md = GetWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::memory::dims strides{0, 0}; if (param.stride.ndim() == 2) { strides[0] = param.stride[0]; @@ -154,7 +154,7 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { - TmpMemMgr::Instance().Init(ctx.requested[deconv::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const DeconvolutionParam& param = nnvm::get(attrs.parsed); mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd( @@ -167,10 +167,10 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_data( + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data( deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second)); CommitOutput(out_data[deconv::kOut], out_mem); - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); // add bias, broadcast bias to dim 1: channel if (!param.no_bias) { // MKLDNN only supports float right now. 
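  // The separate broadcast add is needed because the deconvolution forward
  // pass above is implemented with mkldnn::convolution_backward_data, a
  // primitive that has no bias input of its own.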
@@ -188,7 +188,7 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - TmpMemMgr::Instance().Init(ctx.requested[deconv::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const std::vector &in_grad = outputs; const DeconvolutionParam& param = nnvm::get(attrs.parsed); CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; @@ -202,7 +202,7 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext bwdData_pd.weights_primitive_desc(), param.num_group); auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], bwdData_pd.dst_primitive_desc(), req[deconv::kData]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_forward(bwdData_pd, + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); CommitOutput(in_grad[deconv::kData], in_grad_mem); } @@ -216,11 +216,11 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext bwdWeights_pd.diff_dst_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], bwdWeights_pd.diff_weights_primitive_desc(), req[deconv::kWeight]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::convolution_backward_weights( + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); CommitOutput(in_grad[deconv::kWeight], in_grad_weight); } - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); if (!param.no_bias) { typedef float DType; Stream *s = ctx.get_stream(); diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index d79a7b6203eb..cbb078f620f6 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -35,7 +35,7 @@ inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( const mkldnn::memory::desc &out_md) { auto data_md = GetMemDesc(data); auto weight_md = GetMemDesc(weight); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); if (bias) { auto bias_md = GetMemDesc(*bias); mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, @@ -54,7 +54,7 @@ inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData( auto data_md = GetMemDesc(data); auto weight_md = GetMemDesc(weight); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd); } @@ -65,7 +65,7 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei auto data_md = GetMemDesc(data); auto weight_md = GetMemDesc(weight); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); if (bias) { auto bias_md = GetMemDesc(*bias); mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, @@ -83,7 +83,7 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const 
std::vector &in_data, const std::vector &req, const std::vector &out_data) { - TmpMemMgr::Instance().Init(ctx.requested[fullc::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); const FullyConnectedParam& param = nnvm::get(attrs.parsed); const TShape& ishape = in_data[fullc::kData].shape(); const TShape& oshape = out_data[fullc::kOut].shape(); @@ -112,21 +112,21 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]); if (param.no_bias) { - MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward( + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward( ipFwd_pd, *data_mem, *weight_mem, *out_mem.second)); } else { auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc()); - MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, *data_mem, *weight_mem, *bias_mem, *out_mem.second)); } CommitOutput(out_data[fullc::kOut], out_mem); - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - TmpMemMgr::Instance().Init(ctx.requested[fullc::kTempSpace]); + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); const std::vector &in_grad = outputs; const FullyConnectedParam& param = nnvm::get(attrs.parsed); const TShape& ishape = inputs[fullc::kData + 1].shape(); @@ -160,7 +160,7 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc()); auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData], ipBwdData_pd.diff_src_primitive_desc(), req[fullc::kData]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_data( + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data( ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); CommitOutput(in_grad[fullc::kData], in_grad_mem); } @@ -175,19 +175,19 @@ void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, ipBwdWeights_pd.diff_weights_primitive_desc(), req[fullc::kWeight]); mkldnn_output_t in_grad_bias; if (param.no_bias) { - MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights( + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); } else { in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias], ipBwdWeights_pd.diff_bias_primitive_desc(), req[fullc::kBias]); - MKLDNNStream::Instance().RegisterPrim(mkldnn::inner_product_backward_weights( + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, *in_grad_bias.second)); } CommitOutput(in_grad[fullc::kWeight], in_grad_weight); CommitOutput(in_grad[fullc::kBias], in_grad_bias); } - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index cc6455efd1eb..825ff5600219 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -86,7 +86,7 @@ inline static pooling_forward::primitive_desc GetPoolingFwd( 
auto pad_l_ = param.pad[1], pad_r_ = param.pad[1]; auto stride_h_ = param.stride[0], stride_w_ = param.stride[1]; - auto engine = CpuEngine::Instance().get_engine(); + auto engine = CpuEngine::Get()->get_engine(); if (param.global_pool) { CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1) << "With Global_pooling: true; only pad = 0 and stride = 1"; @@ -141,13 +141,13 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, if (ctx.is_train && MKLDNNRequireWorkspace(param)) { CHECK(workspace != nullptr); auto workspace_mem = workspace->GetMKLDNNData(); - MKLDNNStream::Instance().RegisterPrim( + MKLDNNStream::Get()->RegisterPrim( pooling_forward(pdesc, *input_mem, *output_memory, *workspace_mem)); } else { - MKLDNNStream::Instance().RegisterPrim( + MKLDNNStream::Get()->RegisterPrim( pooling_forward(pdesc, *input_mem, *output_memory)); } - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, @@ -158,7 +158,7 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, return; } - TmpMemMgr::Instance().Init(ctx.requested[0]); + TmpMemMgr::Get()->Init(ctx.requested[0]); auto diff_dst_mem = out_grad.GetMKLDNNData(); auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); @@ -207,15 +207,15 @@ void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, if (MKLDNNRequireWorkspace(param)) { CHECK(workspace != nullptr); auto workspace_mem = workspace->GetMKLDNNData(); - MKLDNNStream::Instance().RegisterPrim( + MKLDNNStream::Get()->RegisterPrim( pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), *diff_src_mem.second)); } else { - MKLDNNStream::Instance().RegisterPrim( + MKLDNNStream::Get()->RegisterPrim( pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second)); } CommitOutput(in_grad, diff_src_mem); - MKLDNNStream::Instance().Submit(); + MKLDNNStream::Get()->Submit(); } } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc index d87e0fd2c8fe..d8a2ab7ce1b8 100644 --- a/src/operator/nn/mkldnn/mkldnn_softmax.cc +++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc @@ -45,9 +45,9 @@ void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine); auto output_memory = out_data.GetMKLDNNData(); - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory)); - stream.Submit(); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory)); + stream->Submit(); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc index 275640888adf..cbeb405a429b 100644 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -32,7 +32,7 @@ namespace mxnet { namespace op { void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, - const mkldnn::memory &out) { + const mkldnn::memory &out) { std::vector input_pds(2); std::vector scales(2); std::vector inputs; @@ -45,11 +45,12 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, inputs.push_back(arr2); // TODO(zhengda) I need to reorder memory here. 
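  // (The TODO above appears to concern inputs arriving in different MKLDNN
  // layouts: as written, the sum primitive is created directly from the two
  // input primitive descriptors without reordering them to a common format.)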
mkldnn::sum::primitive_desc sum_pd(scales, input_pds); - MKLDNNStream::Instance().RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); + MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); } void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const OpReqType &req, const NDArray &out_data) { + const std::vector &inputs, const OpReqType &req, + const NDArray &out_data) { std::vector in_prims; std::vector in_pds(inputs.size()); std::vector scales(inputs.size()); @@ -63,9 +64,9 @@ void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto output_memory = const_cast(out_data).CreateMKLDNNData( pdesc.dst_primitive_desc()); - MKLDNNStream &stream = MKLDNNStream::Instance(); - stream.RegisterPrim(mkldnn::sum(pdesc, in_prims, *output_memory)); - stream.Submit(); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *output_memory)); + stream->Submit(); } } // namespace op diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 5351110f198a..47dfb771114a 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -326,11 +326,11 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, return true; } -inline static bool backward_PoolingStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { +inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const PoolingParam ¶m = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), GetNumBackInputs(param)); CHECK_EQ(out_attrs->size(), 1); @@ -435,7 +435,7 @@ NNVM_REGISTER_OP(_backward_Pooling) }) #endif .set_attr("FInferStorageType", - backward_PoolingStorageType) + BackwardPoolingStorageType) .set_attr_parser(PoolingParamParser) .set_attr("FCompute", PoolingGradCompute) .set_attr("FComputeEx", PoolingGradCompute_CPU); From 5842d566cadaf1d4cc2488f6b93fa9b44a6e767e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 02:02:04 +0000 Subject: [PATCH 175/264] fix a bug in activation backward. --- src/operator/nn/activation.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 876756cd7ec4..b137de9337fc 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -127,7 +127,7 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param) // There is no reason to use MKLDNN activation if the input isn't in // MKLDNN format. - && in_attrs->at(0) == kMKLDNNStorage) { + && (in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(1) == kMKLDNNStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; From 4811bf2005cb434375e827fbc3b50cf48407ab8b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 02:06:05 +0000 Subject: [PATCH 176/264] Miss a macro in mkldnn_base.cc --- src/operator/nn/mkldnn/mkldnn_base.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index ff77c3a1d221..566379e3ad91 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -17,6 +17,8 @@ * under the License. 
*/ +#if MXNET_USE_MKLDNN == 1 + #include "./mkldnn_base-inl.h" #include "./mkldnn_ops-inl.h" @@ -160,3 +162,5 @@ const mkldnn::memory *GetWeights(const NDArray &arr, } } // namespace mxnet + +#endif From 7735fb6626767cbda2bd70936040cc9e1a795719 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 02:07:43 +0000 Subject: [PATCH 177/264] Fix a bug in data iterator in examples. --- example/image-classification/common/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py index dc8915cda4c8..05f5ddc4506e 100755 --- a/example/image-classification/common/data.py +++ b/example/image-classification/common/data.py @@ -112,7 +112,8 @@ def get_rec_iter(args, kv=None): image_shape = tuple([int(l) for l in args.image_shape.split(',')]) if 'benchmark' in args and args.benchmark: data_shape = (args.batch_size,) + image_shape - train = SyntheticDataIter(args.num_classes, data_shape, 500, np.float32) + train = SyntheticDataIter(args.num_classes, data_shape, + args.num_examples / args.batch_size, np.float32) return (train, None) if kv: (rank, nworker) = (kv.rank, kv.num_workers) From b77ceb33cf449727bcaa0964ce238f0d2bb4da1a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 18:11:07 +0000 Subject: [PATCH 178/264] Avoid memory allocation in ReshapeMKLDNN. --- src/ndarray/ndarray.cc | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 2dcfa230602f..a6312fca7986 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -251,6 +251,11 @@ static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, return def_mem; } +struct EmptyMKLDNNDeleter { + void operator()(mkldnn::memory *mem) { + } +}; + NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { CHECK(!is_none()) << "NDArray is not initialized"; CHECK_GE(shape_.Size(), shape.Size()) @@ -264,7 +269,22 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { CHECK(ptr_->Mkl_mem_ != nullptr); // We shouldn't submit the reorder primitive here because submit will // be called in operators. - ret.ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_, false); + auto format = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc()); + if (format == ptr_->Mkl_mem_->get_primitive_desc().desc().data.format) { + ret.ptr_->Mkl_mem_ = ptr_->Mkl_mem_; + } else { + auto def_pd = GetPrimitiveDesc(ptr_->Mkl_mem_->get_primitive_desc(), format); + auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterMem(ptr_->Mkl_mem_); + stream->RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *def_mem)); + // def_mem points to a memory region in the temp space. It's only valid + // inside an operator. As such, the returned NDArray can only be valid + // inside an operator and the shared point doesn't need to do anything + // when it's destroyed. + ret.ptr_->Mkl_mem_ = std::shared_ptr(def_mem, + EmptyMKLDNNDeleter()); + } return ret; } LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet"; From 724631cfa17ce595087f4c1360551034b901a0d9 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 18:11:42 +0000 Subject: [PATCH 179/264] Avoid memory allocation in storage cast. 
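Going through src.data() and memcpy can force an MKLDNN-layout array to be
materialized in the default format first; instead, wrap the destination
blob's pointer in an mkldnn::memory whose descriptor uses the default
layout and reorder straight into it. The pattern, using the names from
this patch (dst_pd is the default-layout primitive descriptor built for
the destination), is roughly:

    auto src_mkldnn = src.GetMKLDNNData();
    mkldnn::memory dst_mkldnn(dst_pd, dns->dptr_);  // wraps dns, no new buffer
    std::vector<mkldnn::primitive> net;
    net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn));
    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();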
--- src/operator/tensor/cast_storage.cc | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 0ba1efc700de..f09e15c90f46 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -43,8 +43,31 @@ void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) CHECK_EQ(src.dtype(), dns->type_flag_); // This converts the source data to the default format and copy the data to // the destination. - const TBlob &src_blob = src.data(); - memcpy(dns->dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns->type_flag_)); + std::vector net; + auto src_mkldnn = src.GetMKLDNNData(); + auto src_pd = src_mkldnn->get_primitive_desc(); + mkldnn::memory::dims dims(dns->shape_.ndim()); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = dns->shape_[i]; + mkldnn::memory::format format = mkldnn::memory::format::format_undef; + switch (dims.size()) { + case 1: format = mkldnn::memory::format::x; break; + case 2: format = mkldnn::memory::format::nc; break; + case 4: format = mkldnn::memory::format::nchw; break; + // This isn't the right layout when the data has 5 dimensions in MXNet. + // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have + // a corresponding format. + case 5: format = mkldnn::memory::format::goihw; break; + } + CHECK_NE(format, mkldnn::memory::format::format_undef); + mkldnn::memory::format cpp_format = static_cast(format); + mkldnn::memory::data_type cpp_type = static_cast( + src_pd.desc().data.data_type); + mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); + mkldnn::memory::primitive_desc dst_pd(data_md, src_pd.get_engine()); + mkldnn::memory dst_mkldnn(dst_pd, dns->dptr_); + net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); } void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) { From 80a81a4b17872a4ed30bc9f868a7caa0546ab986 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 18:27:47 +0000 Subject: [PATCH 180/264] Fix a bug in cast storage. 
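The previous commit built the destination descriptor from a hard-coded
ndim-to-format switch, which has no correct answer for 5-D data (MXNet
treats five dimensions as ncdhw, but the switch picked goihw). Derive the
default format from the source descriptor via GetDefaultFormat() and
GetPrimitiveDesc() instead, and move those helpers out of ndarray.cc into
mkldnn_base so both call sites share one implementation.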
--- src/ndarray/ndarray.cc | 74 ------------------------ src/operator/nn/mkldnn/mkldnn_base-inl.h | 5 ++ src/operator/nn/mkldnn/mkldnn_base.cc | 74 ++++++++++++++++++++++++ src/operator/tensor/cast_storage.cc | 25 ++------ 4 files changed, 83 insertions(+), 95 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index a6312fca7986..2f91040f31de 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -160,80 +160,6 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { #if MXNET_USE_MKLDNN == 1 -static inline mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { - if (desc.data.ndims == 1) { - return desc.data.format; - } else if (desc.data.ndims == 2) { - if (desc.data.format == mkldnn_io) - return mkldnn_oi; - else - return desc.data.format; - } else if (desc.data.ndims == 4) { - switch (desc.data.format) { - case mkldnn_nchw: - case mkldnn_nhwc: - case mkldnn_chwn: - case mkldnn_nChw8c: - case mkldnn_nChw16c: - return mkldnn_nchw; - case mkldnn_oihw: - case mkldnn_ihwo: - case mkldnn_hwio: - case mkldnn_OIhw8i8o: - case mkldnn_OIhw16i16o: - case mkldnn_OIhw8i16o2i: - case mkldnn_OIhw8o16i2o: - case mkldnn_OIhw8o8i: - case mkldnn_OIhw16o16i: - case mkldnn_IOhw16o16i: - case mkldnn_Oihw8o: - case mkldnn_Oihw16o: - case mkldnn_Ohwi8o: - case mkldnn_Ohwi16o: - case mkldnn_OhIw16o4i: - return mkldnn_oihw; - default: - LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; - return mkldnn_format_undef; - } - } else if (desc.data.ndims == 5) { - switch (desc.data.format) { - case mkldnn_goihw: - case mkldnn_gOIhw8i8o: - case mkldnn_gOIhw16i16o: - case mkldnn_gOIhw8i16o2i: - case mkldnn_gOIhw8o16i2o: - case mkldnn_gOIhw8o8i: - case mkldnn_gOIhw16o16i: - case mkldnn_gIOhw16o16i: - case mkldnn_gOihw8o: - case mkldnn_gOihw16o: - case mkldnn_gOhwi8o: - case mkldnn_gOhwi16o: - case mkldnn_gOhIw16o4i: - return mkldnn_goihw; - default: - LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; - return mkldnn_format_undef; - } - } else { - LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; - return mkldnn_format_undef; - } -} - -static inline mkldnn::memory::primitive_desc GetPrimitiveDesc( - mkldnn::memory::primitive_desc pd, mkldnn_memory_format_t format) { - mkldnn::memory::dims dims(pd.desc().data.ndims); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = pd.desc().data.dims[i]; - mkldnn::memory::format cpp_format = static_cast(format); - mkldnn::memory::data_type cpp_type = static_cast( - pd.desc().data.data_type); - mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); - return mkldnn::memory::primitive_desc(data_md, pd.get_engine()); -} - static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, bool submit_now = true) { auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index a4adb275e7f7..48d25022231d 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -285,6 +285,11 @@ const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::engine &engine, int num_groups = 1); +mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc); +mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, + mkldnn_memory_format_t format); + + } // namespace mxnet #endif #endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc 
b/src/operator/nn/mkldnn/mkldnn_base.cc index 566379e3ad91..1cf538f5a86e 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -161,6 +161,80 @@ const mkldnn::memory *GetWeights(const NDArray &arr, } } +mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { + if (desc.data.ndims == 1) { + return desc.data.format; + } else if (desc.data.ndims == 2) { + if (desc.data.format == mkldnn_io) + return mkldnn_oi; + else + return desc.data.format; + } else if (desc.data.ndims == 4) { + switch (desc.data.format) { + case mkldnn_nchw: + case mkldnn_nhwc: + case mkldnn_chwn: + case mkldnn_nChw8c: + case mkldnn_nChw16c: + return mkldnn_nchw; + case mkldnn_oihw: + case mkldnn_ihwo: + case mkldnn_hwio: + case mkldnn_OIhw8i8o: + case mkldnn_OIhw16i16o: + case mkldnn_OIhw8i16o2i: + case mkldnn_OIhw8o16i2o: + case mkldnn_OIhw8o8i: + case mkldnn_OIhw16o16i: + case mkldnn_IOhw16o16i: + case mkldnn_Oihw8o: + case mkldnn_Oihw16o: + case mkldnn_Ohwi8o: + case mkldnn_Ohwi16o: + case mkldnn_OhIw16o4i: + return mkldnn_oihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } else if (desc.data.ndims == 5) { + switch (desc.data.format) { + case mkldnn_goihw: + case mkldnn_gOIhw8i8o: + case mkldnn_gOIhw16i16o: + case mkldnn_gOIhw8i16o2i: + case mkldnn_gOIhw8o16i2o: + case mkldnn_gOIhw8o8i: + case mkldnn_gOIhw16o16i: + case mkldnn_gIOhw16o16i: + case mkldnn_gOihw8o: + case mkldnn_gOihw16o: + case mkldnn_gOhwi8o: + case mkldnn_gOhwi16o: + case mkldnn_gOhIw16o4i: + return mkldnn_goihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } else { + LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; + return mkldnn_format_undef; + } +} + +mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, + mkldnn_memory_format_t format) { + mkldnn::memory::dims dims(pd.desc().data.ndims); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = pd.desc().data.dims[i]; + mkldnn::memory::format cpp_format = static_cast(format); + mkldnn::memory::data_type cpp_type = static_cast( + pd.desc().data.data_type); + mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); + return mkldnn::memory::primitive_desc(data_md, pd.get_engine()); +} + } // namespace mxnet #endif diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index f09e15c90f46..c9b8f1e25a68 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -41,30 +41,13 @@ void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), Context::kCPU); CHECK(src.shape() == dns->shape_); CHECK_EQ(src.dtype(), dns->type_flag_); - // This converts the source data to the default format and copy the data to - // the destination. + // This converts the source data to the default format and write the data to + // the destination directly. 
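  // GetDefaultFormat() maps blocked layouts such as nChw16c back to their
  // plain counterparts (nchw, oihw, goihw), so the reorder below can scatter
  // a blocked source straight into the row-major destination buffer.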
std::vector net; auto src_mkldnn = src.GetMKLDNNData(); auto src_pd = src_mkldnn->get_primitive_desc(); - mkldnn::memory::dims dims(dns->shape_.ndim()); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = dns->shape_[i]; - mkldnn::memory::format format = mkldnn::memory::format::format_undef; - switch (dims.size()) { - case 1: format = mkldnn::memory::format::x; break; - case 2: format = mkldnn::memory::format::nc; break; - case 4: format = mkldnn::memory::format::nchw; break; - // This isn't the right layout when the data has 5 dimensions in MXNet. - // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have - // a corresponding format. - case 5: format = mkldnn::memory::format::goihw; break; - } - CHECK_NE(format, mkldnn::memory::format::format_undef); - mkldnn::memory::format cpp_format = static_cast(format); - mkldnn::memory::data_type cpp_type = static_cast( - src_pd.desc().data.data_type); - mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); - mkldnn::memory::primitive_desc dst_pd(data_md, src_pd.get_engine()); + auto def_format = GetDefaultFormat(src_pd.desc()); + auto dst_pd = GetPrimitiveDesc(src_pd, def_format); mkldnn::memory dst_mkldnn(dst_pd, dns->dptr_); net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn)); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); From aedcfd6e7eae6f72bf2c0998eb92eeaa4e2ded51 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 13 Dec 2017 19:32:55 +0000 Subject: [PATCH 181/264] Handle sliced MKLDNN NDArray. --- src/ndarray/ndarray.cc | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 2f91040f31de..51e37a7f4e46 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -193,6 +193,8 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { } else if (storage_type() == kMKLDNNStorage) { NDArray ret(kMKLDNNStorage, shape, ctx(), ptr_->delay_alloc, dtype()); CHECK(ptr_->Mkl_mem_ != nullptr); + // This doesn't work on sliced NDArray yet. + CHECK_EQ(byte_offset_, 0); // We shouldn't submit the reorder primitive here because submit will // be called in operators. auto format = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc()); @@ -239,6 +241,8 @@ NDArray NDArray::Reshape(const TShape &shape) const { } else { ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_; } + // We should make sure slice still works. + ret.byte_offset_ = this->byte_offset_; } }, ctx(), {this->var()}, {ret.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); @@ -485,6 +489,8 @@ const mkldnn::memory *NDArray::GetMKLDNNData( if (ptr_->Mkl_mem_->get_primitive_desc() == desc || (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2))) { + // This doesn't work on sliced NDArray yet. + CHECK_EQ(byte_offset_, 0); MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } else { @@ -498,6 +504,8 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; } + // This doesn't work on sliced NDArray yet. 
+ CHECK_EQ(byte_offset_, 0); if (ptr_->storage_type == kDefaultStorage) { ptr_->SetMKLMem(shape_, dtype_); } @@ -535,7 +543,33 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const { ptr_->SetMKLMem(shape_, dtype_); if (ptr_->Mkl_mem_) { MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); - return ptr_->Mkl_mem_.get(); + if (byte_offset_ > 0) { + // Slice only works on the default layout and Slice() turns an array into + // the default layout. + auto pd = ptr_->Mkl_mem_->get_primitive_desc(); + CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format); + void *off_addr = static_cast(ptr_->Mkl_mem_->get_data_handle()) + + byte_offset_; + + // Create the primitive desc for the new mkldnn memory. + mkldnn::memory::dims dims(pd.desc().data.ndims); + // The first dimension has been sliced. + dims[0] = shape()[0]; + for (size_t i = 1; i < dims.size(); i++) + dims[i] = pd.desc().data.dims[i]; + mkldnn::memory::format cpp_format = static_cast( + pd.desc().data.format); + mkldnn::memory::data_type cpp_type = static_cast( + pd.desc().data.data_type); + mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); + mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine()); + + std::shared_ptr ret(new mkldnn::memory(new_pd, off_addr)); + MKLDNNStream::Get()->RegisterMem(ret); + return ret.get(); + } else { + return ptr_->Mkl_mem_.get(); + } } else { // We don't support converting sparse format. return nullptr; @@ -555,6 +589,8 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { return; } + // This doesn't work on sliced NDArray yet. + CHECK_EQ(byte_offset_, 0); MKLDNNStream *stream = MKLDNNStream::Get(); ptr_->SetMKLMem(shape_, dtype_); stream->RegisterMem(ptr_->Mkl_mem_); From b35b74d196d8089edf4d531889f0d288541ebf9b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 14 Dec 2017 01:11:35 +0000 Subject: [PATCH 182/264] Use memcpy if NDArray uses default format. --- src/operator/tensor/cast_storage.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index c9b8f1e25a68..2e12f561e697 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -47,10 +47,15 @@ void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) auto src_mkldnn = src.GetMKLDNNData(); auto src_pd = src_mkldnn->get_primitive_desc(); auto def_format = GetDefaultFormat(src_pd.desc()); - auto dst_pd = GetPrimitiveDesc(src_pd, def_format); - mkldnn::memory dst_mkldnn(dst_pd, dns->dptr_); - net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + if (def_format != src_pd.desc().data.format) { + auto dst_pd = GetPrimitiveDesc(src_pd, def_format); + mkldnn::memory dst_mkldnn(dst_pd, dns->dptr_); + net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + } else { + const TBlob &src_blob = src.data(); + memcpy(dns->dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns->type_flag_)); + } } void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) { From 2ea3ee22551825acc1c0f5536b7f1f44afefc36f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 14 Dec 2017 19:36:29 +0000 Subject: [PATCH 183/264] Revert "Limit MKLDNN ops being used." This reverts commit 75e2ae570d03483868ec4ed8ed46015c7fa6c6fb. 
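The reverted commit gated MKLDNN dispatch on the inputs already carrying
kMKLDNNStorage (or, for convolution and deconvolution, kDefaultStorage as
well). With it reverted, storage-type inference for these CPU operators
keys off the device mask alone again, as in the restored body below:

    if (dev_mask == mshadow::cpu::kDevMask) {
      *dispatch_mode = DispatchMode::kFComputeEx;
      (*out_attrs)[0] = kMKLDNNStorage;
      return true;
    }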
--- src/operator/nn/activation.cc | 10 ++-------- src/operator/nn/convolution.cc | 22 ++++++---------------- src/operator/nn/deconvolution.cc | 14 ++------------ src/operator/nn/fully_connected.cc | 10 ++-------- src/operator/nn/pooling.cc | 12 ++---------- 5 files changed, 14 insertions(+), 54 deletions(-) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index b137de9337fc..b5386babc610 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -98,10 +98,7 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param) - // There is no reason to use MKLDNN activation if the input isn't in - // MKLDNN format. - && in_attrs->at(0) == kMKLDNNStorage) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -124,10 +121,7 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param) - // There is no reason to use MKLDNN activation if the input isn't in - // MKLDNN format. - && (in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(1) == kMKLDNNStorage)) { + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index d50b9747eff0..ae0fd149501f 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -293,22 +293,17 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs, } inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { const ConvolutionParam& param = nnvm::get(attrs.parsed); uint32_t in_expected = param.no_bias ? 2 : 3; CHECK_EQ(in_attrs->size(), in_expected); CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask - // We should allow MKLDNN conv to apply to the default storage as well. - // Even with format conversion, MKLDNN conv should still be faster than - // the native implementation. - && (in_attrs->at(0) == kMKLDNNStorage - || in_attrs->at(0) == kDefaultStorage)) { + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -331,12 +326,7 @@ inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask - // We should allow MKLDNN conv to apply to the default storage as well. - // Even with format conversion, MKLDNN conv should still be faster than - // the native implementation. - && (in_attrs->at(0) == kMKLDNNStorage - || in_attrs->at(0) == kDefaultStorage)) { + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[conv::kData] = kMKLDNNStorage; // We don't want the parameter gradients are stored in MKLDNN storage. 
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 086f359931b2..1c6323e42e91 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -267,12 +267,7 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask - // We should allow MKLDNN conv to apply to the default storage as well. - // Even with format conversion, MKLDNN conv should still be faster than - // the native implementation. - && (in_attrs->at(0) == kMKLDNNStorage - || in_attrs->at(0) == kDefaultStorage)) { + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -294,12 +289,7 @@ inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask - // We should allow MKLDNN conv to apply to the default storage as well. - // Even with format conversion, MKLDNN conv should still be faster than - // the native implementation. - && (in_attrs->at(0) == kMKLDNNStorage - || in_attrs->at(0) == kDefaultStorage)) { + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[deconv::kData] = kMKLDNNStorage; // We don't want the parameter gradients are stored in MKLDNN storage. diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 1770c459b1c8..4f7e425593b3 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -138,10 +138,7 @@ inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - // The native implementation uses BLAS. It shouldn't be slower than MKLDNN - // FC. If the input data has the default format, there is format conversion - // overhead as well. - if (dev_mask == mshadow::cpu::kDevMask && in_attrs->at(0) == kMKLDNNStorage) { + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -163,10 +160,7 @@ inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - // The native implementation uses BLAS. It shouldn't be slower than MKLDNN - // FC. If the input data has the default format, there is format conversion - // overhead as well. - if (dev_mask == mshadow::cpu::kDevMask && in_attrs->at(0) == kMKLDNNStorage) { + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[fullc::kData] = kMKLDNNStorage; // We don't want the parameter gradients are stored in MKLDNN storage. diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 47dfb771114a..845260e0af9d 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -306,12 +306,7 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); - auto expected = MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; - CHECK_EQ(out_attrs->size(), expected); - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param) - // There is no reason to use MKLDNN pooling if the input isn't in - // MKLDNN format. 
-      && in_attrs->at(0) == kMKLDNNStorage) {
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
     *dispatch_mode = DispatchMode::kFComputeEx;
     for (size_t i = 0; i < out_attrs->size(); i++)
       (*out_attrs)[i] = kMKLDNNStorage;
@@ -336,10 +331,7 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs,
   CHECK_EQ(out_attrs->size(), 1);
 #if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)
-      // There is no reason to use MKLDNN pooling if the input isn't in
-      // MKLDNN format.
-      && in_attrs->at(0) == kMKLDNNStorage) {
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
     *dispatch_mode = DispatchMode::kFComputeEx;
     for (size_t i = 0; i < out_attrs->size(); i++)
       (*out_attrs)[i] = kMKLDNNStorage;

From 025be146deb05622e16994816aff8babef8a16a6 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Thu, 14 Dec 2017 19:36:54 +0000
Subject: [PATCH 184/264] Make MKLDNN activation backward use the same input layout.

---
 src/operator/nn/mkldnn/mkldnn_act-inl.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h
index 46711f87a796..7a9104ceca35 100644
--- a/src/operator/nn/mkldnn/mkldnn_act-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h
@@ -104,6 +104,10 @@ void MKLDNNActivationBackward(const OpContext &ctx, const ActivationParam& param
   auto diff_dst_memory = out_grad.GetMKLDNNData();
   auto input_mem = in_data.GetMKLDNNData();
+  // We need to make sure the two inputs to eltwise_backward have the same memory
+  // descriptor; otherwise, performance will suffer.
+  if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc())
+    input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc());
   mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
   mkldnn::memory::desc data_md = data_mpd.desc();
   mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();

From 74f8d75a8820baef169e1cc25681ee4ce08dc5a8 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Thu, 14 Dec 2017 22:05:48 +0000
Subject: [PATCH 185/264] Fix a bug in mkldnn activation.

---
 src/operator/nn/mkldnn/mkldnn_act-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h
index 7a9104ceca35..664a27c99560 100644
--- a/src/operator/nn/mkldnn/mkldnn_act-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h
@@ -102,6 +102,7 @@ void MKLDNNActivationBackward(const OpContext &ctx, const ActivationParam& param
     return;
   }
+  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
   auto diff_dst_memory = out_grad.GetMKLDNNData();
   auto input_mem = in_data.GetMKLDNNData();
   // We need to make sure the two inputs to eltwise_backward have the same memory
   // descriptor; otherwise, performance will suffer.
   if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc())
     input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc());
   mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
   mkldnn::memory::desc data_md = data_mpd.desc();
   mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();
   auto cpu_engine = data_mpd.get_engine();
   Dtype alpha = 0;
-  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
   auto alg = GetMKLDNNActAlgo(param);
   mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,

From aff0541891f06557959c9d33dfaebceee200a340 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Fri, 15 Dec 2017 19:35:57 +0000
Subject: [PATCH 186/264] Use MKLDNN sum in more cases.
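The idea of this change, in sketch form: treat the default storage type as MKLDNN-compatible as well, and take the MKLDNN sum path only when every input qualifies. The scalar helper below matches the one added in the diff; the vector overload condenses the helper added to elemwise_sum.cc:

    // Default storage is also usable by MKLDNN primitives (possibly after
    // a cheap reorder), not just kMKLDNNStorage.
    static inline bool SupportStorageMKLDNN(int stype) {
      return stype == kMKLDNNStorage || stype == kDefaultStorage;
    }

    // A sum can go through MKLDNN only if all of its inputs qualify.
    static inline bool SupportStorageMKLDNN(const std::vector<int>& stypes) {
      for (int stype : stypes) {
        if (!SupportStorageMKLDNN(stype))
          return false;
      }
      return true;
    }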
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 7 +++++- .../tensor/elemwise_binary_op_basic.cc | 6 ++--- src/operator/tensor/elemwise_sum.cc | 25 ++++++++++++++++--- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 48d25022231d..f14030973736 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -122,13 +122,18 @@ static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) { return support; } +static inline bool SupportStorageMKLDNN(int stype) { + return stype == kMKLDNNStorage || stype == kDefaultStorage; +} + static inline bool SupportMKLDNN(int dtype, const TShape &shape) { int ndim = shape.ndim(); return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); } static inline bool SupportMKLDNN(const NDArray &input) { - return SupportMKLDNN(input.dtype(), input.shape()); + return SupportMKLDNN(input.dtype(), input.shape()) + && SupportStorageMKLDNN(input.storage_type()); } static inline bool SupportMKLDNNConv(const NDArray &input) { diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 1c5ff0ec91d5..ae143684a1d8 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -25,6 +25,7 @@ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" #include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { @@ -37,8 +38,7 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_MKLDNN == 1 - if (inputs[0].storage_type() == kMKLDNNStorage - || inputs[1].storage_type() == kMKLDNNStorage) { + if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) { MKLDNNSum_Forward(attrs, ctx, inputs, req[0], outputs[0]); return; } else if (inputs[0].storage_type() == kDefaultStorage @@ -68,7 +68,7 @@ static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if ((in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(1) == kMKLDNNStorage) + if ((SupportStorageMKLDNN(in_attrs->at(0)) || SupportStorageMKLDNN(in_attrs->at(1))) && dev_mask == mshadow::cpu::kDevMask) { out_attrs->at(0) = kMKLDNNStorage; *dispatch_mode = DispatchMode::kFComputeEx; diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index ed12917594e2..1ee2c9a4235c 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -25,6 +25,7 @@ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" #include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../nn/mkldnn/mkldnn_base-inl.h" #include "../../common/utils.h" namespace mxnet { @@ -74,6 +75,25 @@ bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } +#if MXNET_USE_MKLDNN == 1 +static inline bool SupportMKLDNN(const std::vector& inputs) { + for (auto &i : inputs) { + if (!SupportMKLDNN(i)) + return false; + } + return true; +} + +static inline bool SupportStorageMKLDNN(const std::vector &inputs) { + for (int i : inputs) { + if (!mxnet::SupportStorageMKLDNN(i)) + return false; + } + return true; +} + +#endif + bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -82,8 +102,7 @@ bool 
ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   CHECK(!in_attrs->empty());
   CHECK_EQ(out_attrs->size(), 1U);
 #if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask
-      && common::ContainsStorage(*in_attrs, kMKLDNNStorage)) {
+  if (dev_mask == mshadow::cpu::kDevMask && SupportStorageMKLDNN(*in_attrs)) {
     *dispatch_mode = DispatchMode::kFComputeEx;
     (*out_attrs)[0] = kMKLDNNStorage;
     return true;
@@ -110,7 +129,7 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
     NDArray out_nd = outputs[0];
     mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd);
 #if MXNET_USE_MKLDNN == 1
-  } else if (common::ContainsStorage(inputs, kMKLDNNStorage)) {
+  } else if (SupportMKLDNN(inputs)) {
     MKLDNNSum_Forward(attrs, op_ctx, inputs, req[0], outputs[0]);
 #endif
   } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {

From 0f98ce0984ea55e665660e75d38aa69f5a0ebdba Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sat, 16 Dec 2017 01:05:06 +0000
Subject: [PATCH 187/264] Improve perf of reorder.

---
 src/ndarray/ndarray.cc | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 51e37a7f4e46..5706bb3d4d85 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -628,10 +628,36 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
     mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
     stream->RegisterMem(tmp_mem);
     stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_));
-  } else {
+  } else if (mem.get_primitive_desc() == ptr_->Mkl_mem_->get_primitive_desc()) {
+    // If the layout is the same, we can just copy data.
     stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_));
+  } else {
+    auto src_def = GetDefaultFormat(mem.get_primitive_desc().desc());
+    auto dst_def = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc());
+    // If neither one uses the default layout, there isn't much we can do
+    // other than reorder the data layout directly.
+    if (dst_def != ptr_->Mkl_mem_->get_primitive_desc().desc().data.format
+        && src_def != mem.get_primitive_desc().desc().data.format) {
+      stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_));
+    } else if (dst_def == ptr_->Mkl_mem_->get_primitive_desc().desc().data.format) {
+      // If the dest mem uses the default memory layout, we can simply use
+      // the default format of the source memory to improve perf of reorder.
+      auto pd = GetPrimitiveDesc(ptr_->Mkl_mem_->get_primitive_desc(), src_def);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->Mkl_mem_->get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
+    } else {
+      // If the src mem uses the default memory layout, we can use
+      // the default format of the destination memory to improve perf.
+      auto pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_));
+    }
   }
 }
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format);
 mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
   mkldnn::memory::primitive_desc _desc = desc;

From aa7afd407db84a500fe1232bd9daa7a02e681d79 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sat, 16 Dec 2017 02:07:24 +0000
Subject: [PATCH 188/264] Avoid memory reorder in conv and deconv.
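The pattern applied below, condensed from the convolution backward path (names follow the diff): reorder the output gradient once for the data-gradient computation and reuse that memory for the weight-gradient computation, reordering again only if the two primitive descriptors actually differ:

    // Sketch: hoist the reorder out of the data-gradient branch ...
    auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
        bwdData_pd.diff_dst_primitive_desc());
    // ... the data-gradient branch consumes out_grad_mem ...
    // ... and the weight-gradient branch reuses it unless the layouts differ.
    if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc())
      out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
          bwdWeights_pd.diff_dst_primitive_desc());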
--- src/operator/nn/mkldnn/mkldnn_convolution.cc | 14 +++++++------- src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 9 +++++---- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index c2b01c61da5c..b88d6823d8ac 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -205,12 +205,12 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; + mkldnn::convolution_backward_data::primitive_desc bwdData_pd + = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + inputs[conv::kOut], fwd_pd); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdData_pd.diff_dst_primitive_desc()); if (req[conv::kData]) { - mkldnn::convolution_backward_data::primitive_desc bwdData_pd - = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], - inputs[conv::kOut], fwd_pd); - auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( - bwdData_pd.diff_dst_primitive_desc()); auto weight_mem = GetWeights(inputs[conv::kWeight + 1], bwdData_pd.weights_primitive_desc(), param.num_group); auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], @@ -224,8 +224,8 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut], fwd_pd); - auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( - bwdWeights_pd.diff_dst_primitive_desc()); + if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc()) + out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(bwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[conv::kWeight], diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index d0605e3d448b..f306adcf4bd6 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -195,9 +195,9 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false, inputs[deconv::kOut]); + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdData_pd.src_primitive_desc()); if (req[deconv::kData]) { - auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( - bwdData_pd.src_primitive_desc()); auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], bwdData_pd.weights_primitive_desc(), param.num_group); auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], @@ -210,8 +210,9 @@ void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd = GetDeconvBwdWeights(param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd); - auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( - bwdWeights_pd.src_primitive_desc()); + if (bwdData_pd.src_primitive_desc() != bwdWeights_pd.src_primitive_desc()) + out_grad_mem 
= inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.diff_dst_primitive_desc()); auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], From f35fa37bc9dd57a1ec0276a79b4df2140097430a Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 16 Dec 2017 22:26:52 +0000 Subject: [PATCH 189/264] Avoid unnecessary storage cast in fallback path. --- include/mxnet/ndarray.h | 1 + src/common/exec_utils.h | 7 ++++++- src/ndarray/ndarray.cc | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index e5e9d753ab61..93c47788e92d 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -551,6 +551,7 @@ class NDArray { } #if MXNET_USE_MKLDNN == 1 + bool IsMKLDNNDefault() const; /* * All functions below return a raw pointer to mkldnn memory. Actually there * is a shared pointer that hold the memory either in NDArray or in MKLDNN diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index dcd1504fb88e..4b90dd81b157 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -51,7 +51,12 @@ inline bool SetupDefaultBlobs(const std::vector& src, bool require_cast = false; for (size_t i = 0; i < src.size(); i++) { auto& nd = src[i]; - if (nd.storage_type() != kDefaultStorage) { + bool is_default = nd.storage_type() == kDefaultStorage; +#if MXNET_USE_MKLDNN == 1 + // If this is mkldnn storage and it uses the default layout. + is_default = is_default || nd.IsMKLDNNDefault(); +#endif + if (!is_default) { if (idx_map != nullptr) { (*idx_map)[i] = temp_dst->size(); } diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 5706bb3d4d85..7e73492cae66 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -402,6 +402,17 @@ static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims return true; } +bool NDArray::IsMKLDNNDefault() const { + // If we don't have mkldnn memory yet, we just assume it's not the default + // format. + if (storage_type() == kMKLDNNStorage && ptr_->Mkl_mem_ != nullptr) { + auto desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); + return desc.data.format == GetDefaultFormat(desc); + } else { + return false; + } +} + void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data From 449b537b8fba97e8192871fe940695254ad3e344 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 16 Dec 2017 23:06:56 +0000 Subject: [PATCH 190/264] Revert "Use MKLDNN sum in more cases." This reverts commit 7a21ebca8bbe17fde49c3b1ca3f31b835a33afb8. 
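With the relaxed check gone, elementwise sum falls back to the stricter dispatch rule. A one-line sketch of the restored condition (common::ContainsStorage is the helper visible in the diff below):

    // Take the MKLDNN sum path only when at least one input is already
    // stored in MKLDNN format; otherwise stay on the default CPU path.
    bool use_mkldnn_sum = dev_mask == mshadow::cpu::kDevMask
        && common::ContainsStorage(*in_attrs, kMKLDNNStorage);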
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 7 +----- .../tensor/elemwise_binary_op_basic.cc | 6 ++--- src/operator/tensor/elemwise_sum.cc | 25 +++---------------- 3 files changed, 7 insertions(+), 31 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index f14030973736..48d25022231d 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -122,18 +122,13 @@ static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) { return support; } -static inline bool SupportStorageMKLDNN(int stype) { - return stype == kMKLDNNStorage || stype == kDefaultStorage; -} - static inline bool SupportMKLDNN(int dtype, const TShape &shape) { int ndim = shape.ndim(); return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); } static inline bool SupportMKLDNN(const NDArray &input) { - return SupportMKLDNN(input.dtype(), input.shape()) - && SupportStorageMKLDNN(input.storage_type()); + return SupportMKLDNN(input.dtype(), input.shape()); } static inline bool SupportMKLDNNConv(const NDArray &input) { diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index ae143684a1d8..1c5ff0ec91d5 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -25,7 +25,6 @@ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" #include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { @@ -38,7 +37,8 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_MKLDNN == 1 - if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) { + if (inputs[0].storage_type() == kMKLDNNStorage + || inputs[1].storage_type() == kMKLDNNStorage) { MKLDNNSum_Forward(attrs, ctx, inputs, req[0], outputs[0]); return; } else if (inputs[0].storage_type() == kDefaultStorage @@ -68,7 +68,7 @@ static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if ((SupportStorageMKLDNN(in_attrs->at(0)) || SupportStorageMKLDNN(in_attrs->at(1))) + if ((in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(1) == kMKLDNNStorage) && dev_mask == mshadow::cpu::kDevMask) { out_attrs->at(0) = kMKLDNNStorage; *dispatch_mode = DispatchMode::kFComputeEx; diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 1ee2c9a4235c..ed12917594e2 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -25,7 +25,6 @@ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" #include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_base-inl.h" #include "../../common/utils.h" namespace mxnet { @@ -75,25 +74,6 @@ bool ElementWiseSumType(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, -1); } -#if MXNET_USE_MKLDNN == 1 -static inline bool SupportMKLDNN(const std::vector& inputs) { - for (auto &i : inputs) { - if (!SupportMKLDNN(i)) - return false; - } - return true; -} - -static inline bool SupportStorageMKLDNN(const std::vector &inputs) { - for (int i : inputs) { - if (!mxnet::SupportStorageMKLDNN(i)) - return false; - } - return true; -} - -#endif - bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -102,7 +82,8 @@ bool 
ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 1U); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportStorageMKLDNN(*in_attrs)) { + if (dev_mask == mshadow::cpu::kDevMask + && common::ContainsStorage(*in_attrs, kMKLDNNStorage)) { *dispatch_mode = DispatchMode::kFComputeEx; (*out_attrs)[0] = kMKLDNNStorage; return true; @@ -129,7 +110,7 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, NDArray out_nd = outputs[0]; mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); #if MXNET_USE_MKLDNN == 1 - } else if (SupportMKLDNN(inputs)) { + } else if (common::ContainsStorage(inputs, kMKLDNNStorage)) { MKLDNNSum_Forward(attrs, op_ctx, inputs, req[0], outputs[0]); #endif } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { From db33412c065d4cdf2104de5aca5b93ed7a5fa73b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Dec 2017 05:56:21 +0000 Subject: [PATCH 191/264] Handle sliced ndarray in more cases. --- src/ndarray/ndarray.cc | 108 +++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 59 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 7e73492cae66..2edc6d041aa0 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -402,6 +402,11 @@ static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims return true; } +static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::desc desc) { + return same_shape(shape, desc.data.dims, desc.data.ndims) + && get_mkldnn_type(dtype) == desc.data.data_type; +} + bool NDArray::IsMKLDNNDefault() const { // If we don't have mkldnn memory yet, we just assume it's not the default // format. @@ -420,6 +425,9 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // TODO(zhengda) is it possible that the MKL memory is out-of-date? if (Mkl_mem_ && storage_type == kMKLDNNStorage) { return; + } else if (Mkl_mem_ && Mkl_mem_->get_data_handle() == shandle.dptr + && same_shape(shape, dtype, Mkl_mem_->get_primitive_desc().desc())) { + return; } mkldnn::memory::dims dims; @@ -470,10 +478,10 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { * the formats are different. I need to double check its format. */ static inline mkldnn::memory *GetMKLDNNExact( - mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) { + const mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) { auto src_desc = mem->get_primitive_desc(); if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) { - return mem; + return const_cast(mem); } else { std::shared_ptr ret(new mkldnn::memory( desc, mem->get_data_handle())); @@ -488,21 +496,15 @@ const mkldnn::memory *NDArray::GetMKLDNNData( LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; } - if (ptr_->storage_type == kDefaultStorage) { - ptr_->SetMKLMem(shape_, dtype_); - } - CHECK(ptr_->Mkl_mem_ != nullptr); + auto mem = GetMKLDNNData(); mkldnn::memory::primitive_desc _desc = desc; - auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc(); + auto desc1 = mem->get_primitive_desc().desc(); auto desc2 = _desc.desc(); // The MKL memory has the same format and shape as required, // or both use the default format, we can return the MKL memory. 
- if (ptr_->Mkl_mem_->get_primitive_desc() == desc + if (mem->get_primitive_desc() == desc || (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2))) { - // This doesn't work on sliced NDArray yet. - CHECK_EQ(byte_offset_, 0); - MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } else { return nullptr; @@ -515,36 +517,28 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; } - // This doesn't work on sliced NDArray yet. - CHECK_EQ(byte_offset_, 0); - if (ptr_->storage_type == kDefaultStorage) { - ptr_->SetMKLMem(shape_, dtype_); - } - // If the array uses the default format, the MKL memory now references to - // the default storage. If it uses the MKLDNN format, the MKL memory should - // have been initialized since we are trying to get data from the array. - CHECK(ptr_->Mkl_mem_ != nullptr); + CHECK(storage_type() == kMKLDNNStorage || storage_type() == kDefaultStorage); + + auto mem = GetMKLDNNData(); // If the memory descriptor matches, it's easy. MKLDNNStream *stream = MKLDNNStream::Get(); - // We need to make sure Mkl_mem_ is always valid as well. - stream->RegisterMem(ptr_->Mkl_mem_); - if (ptr_->Mkl_mem_->get_primitive_desc() == desc) { - return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); + if (mem->get_primitive_desc() == desc) { + return GetMKLDNNExact(mem, desc); } mkldnn::memory::primitive_desc _desc = desc; // Now we need to determine if we should reorder the memory. - // If both use the default formats, we think we don't need to reshape. - auto desc1 = ptr_->Mkl_mem_->get_primitive_desc().desc(); + // If both use the default formats, we think we don't need to reorder. + auto desc1 = mem->get_primitive_desc().desc(); auto desc2 = _desc.desc(); if (desc1.data.format == GetDefaultFormat(desc1) && desc2.data.format == GetDefaultFormat(desc2)) { - mkldnn_mem_ptr ret(new mkldnn::memory(desc, ptr_->Mkl_mem_->get_data_handle())); + mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle())); stream->RegisterMem(ret); return ret.get(); } else { auto ret = TmpMemMgr::Get()->Alloc(desc); - stream->RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *ret)); + stream->RegisterPrim(mkldnn::reorder(*mem, *ret)); return ret; } } @@ -552,38 +546,34 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( const mkldnn::memory *NDArray::GetMKLDNNData() const { CHECK(storage_type() == kMKLDNNStorage || storage_type() == kDefaultStorage); ptr_->SetMKLMem(shape_, dtype_); - if (ptr_->Mkl_mem_) { - MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); - if (byte_offset_ > 0) { - // Slice only works on the default layout and Slice() turns an array into - // the default layout. - auto pd = ptr_->Mkl_mem_->get_primitive_desc(); - CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format); - void *off_addr = static_cast(ptr_->Mkl_mem_->get_data_handle()) - + byte_offset_; - - // Create the primitive desc for the new mkldnn memory. - mkldnn::memory::dims dims(pd.desc().data.ndims); - // The first dimension has been sliced. 
-    dims[0] = shape()[0];
-    for (size_t i = 1; i < dims.size(); i++)
-      dims[i] = pd.desc().data.dims[i];
-    mkldnn::memory::format cpp_format = static_cast(
-        pd.desc().data.format);
-    mkldnn::memory::data_type cpp_type = static_cast(
-        pd.desc().data.data_type);
-    mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
-    mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine());
-
-    std::shared_ptr ret(new mkldnn::memory(new_pd, off_addr));
-    MKLDNNStream::Get()->RegisterMem(ret);
-    return ret.get();
   } else {
-    // We don't support converting sparse format.
-    return nullptr;
+    return ptr_->Mkl_mem_.get();
   }
 }

From b9f913dfa9002fd4919cc19c82913abee7bd9a17 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sun, 17 Dec 2017 18:08:25 +0000
Subject: [PATCH 192/264] Fix a complaint from make lint.

---
 src/operator/nn/mkldnn/mkldnn_convolution.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index b88d6823d8ac..c74ac04057d2 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -225,7 +225,8 @@ void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &c
         param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut], fwd_pd);
     if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc())
-      out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(bwdWeights_pd.diff_dst_primitive_desc());
+      out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
+          bwdWeights_pd.diff_dst_primitive_desc());
     auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder(
         bwdWeights_pd.src_primitive_desc());
     auto in_grad_weight = CreateMKLDNNMem(in_grad[conv::kWeight],

From 0f9f75aca51f14f0df68cbfd2706aa001cea2be9 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sun, 17 Dec 2017 18:17:59 +0000
Subject: [PATCH 193/264] Update Jenkins to test MKLDNN.
--- Jenkinsfile | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 731e288372c7..94d56e04ba3f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -150,42 +150,40 @@ try { } } }, - 'CPU: MKLML': { + 'CPU: MKLDNN': { node('mxnetlinux-cpu') { - ws('workspace/build-mklml-cpu') { + ws('workspace/build-mkldnn-cpu') { init_git() def flag = """ \ DEV=1 \ USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ - USE_MKL2017=1 \ - USE_MKL2017_EXPERIMENTAL=1 \ + USE_MKLDNN=1 \ -j\$(nproc) """ - make("cpu_mklml", flag) - pack_lib('mklml_cpu') + make("cpu_mkldnn", flag) + pack_lib('mkldnn_cpu') } } }, - 'GPU: MKLML': { + 'GPU: MKLDNN': { node('mxnetlinux-cpu') { - ws('workspace/build-mklml-gpu') { + ws('workspace/build-mkldnn-gpu') { init_git() def flag = """ \ DEV=1 \ USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ - USE_MKL2017=1 \ - USE_MKL2017_EXPERIMENTAL=1 \ + USE_MKLDNN=1 \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ -j\$(nproc) """ make("build_cuda", flag) - pack_lib('mklml_gpu') + pack_lib('mkldnn_gpu') } } }, @@ -328,39 +326,39 @@ try { } } }, - 'Python2: MKLML-CPU': { + 'Python2: MKLDNN-CPU': { node('mxnetlinux-cpu') { - ws('workspace/ut-python2-mklml-cpu') { + ws('workspace/ut-python2-mkldnn-cpu') { init_git() - unpack_lib('mklml_cpu') - python2_ut('cpu_mklml') + unpack_lib('mkldnn_cpu') + python2_ut('cpu_mkldnn') } } }, - 'Python2: MKLML-GPU': { + 'Python2: MKLDNN-GPU': { node('mxnetlinux-gpu') { - ws('workspace/ut-python2-mklml-gpu') { + ws('workspace/ut-python2-mkldnn-gpu') { init_git() - unpack_lib('mklml_gpu') - python2_gpu_ut('gpu_mklml') + unpack_lib('mkldnn_gpu') + python2_gpu_ut('gpu_mkldnn') } } }, - 'Python3: MKLML-CPU': { + 'Python3: MKLDNN-CPU': { node('mxnetlinux-cpu') { - ws('workspace/ut-python3-mklml-cpu') { + ws('workspace/ut-python3-mkldnn-cpu') { init_git() - unpack_lib('mklml_cpu') - python3_ut('cpu_mklml') + unpack_lib('mkldnn_cpu') + python3_ut('cpu_mkldnn') } } }, - 'Python3: MKLML-GPU': { + 'Python3: MKLDNN-GPU': { node('mxnetlinux-gpu') { - ws('workspace/ut-python3-mklml-gpu') { + ws('workspace/ut-python3-mkldnn-gpu') { init_git() - unpack_lib('mklml_gpu') - python3_gpu_ut('gpu_mklml') + unpack_lib('mkldnn_gpu') + python3_gpu_ut('gpu_mkldnn') } } }, From ce8742594ee92de4ac71608e30d127062f9c9cb7 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Dec 2017 18:31:21 +0000 Subject: [PATCH 194/264] Add Dockerfile for CI. 
---
 tests/ci_build/Dockerfile.cpu_mkldnn | 18 ++++++++++++++++++
 tests/ci_build/Dockerfile.gpu_mkldnn | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 tests/ci_build/Dockerfile.cpu_mkldnn
 create mode 100644 tests/ci_build/Dockerfile.gpu_mkldnn

diff --git a/tests/ci_build/Dockerfile.cpu_mkldnn b/tests/ci_build/Dockerfile.cpu_mkldnn
new file mode 100644
index 000000000000..591f11365014
--- /dev/null
+++ b/tests/ci_build/Dockerfile.cpu_mkldnn
@@ -0,0 +1,18 @@
+FROM ubuntu:16.04
+
+COPY install/ubuntu_install_core.sh /install/
+RUN /install/ubuntu_install_core.sh
+COPY install/ubuntu_install_python.sh /install/
+RUN /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_scala.sh /install/
+RUN /install/ubuntu_install_scala.sh
+COPY install/ubuntu_install_r.sh /install/
+RUN /install/ubuntu_install_r.sh
+COPY install/ubuntu_install_perl.sh /install/
+RUN /install/ubuntu_install_perl.sh
+
+# Add MKLML library, compatible with Ubuntu16.04
+#RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
+#RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib

diff --git a/tests/ci_build/Dockerfile.gpu_mkldnn b/tests/ci_build/Dockerfile.gpu_mkldnn
new file mode 100644
index 000000000000..0b6bd2e44505
--- /dev/null
+++ b/tests/ci_build/Dockerfile.gpu_mkldnn
@@ -0,0 +1,18 @@
+FROM nvidia/cuda:8.0-cudnn5-devel
+# cuda8.0 has to be used because this is the first ubuntu16.04 container,
+# which is required due to OpenBLAS being incompatible with ubuntu14.04.
+# We use a GPU base container because we are going to test the MKLDNN
+# operator implementations against the GPU implementations.
+
+COPY install/ubuntu_install_core.sh /install/
+RUN /install/ubuntu_install_core.sh
+COPY install/ubuntu_install_python.sh /install/
+RUN /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_scala.sh /install/
+RUN /install/ubuntu_install_scala.sh
+
+# Add MKLML library, compatible with Ubuntu16.04
+#RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
+#RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib

From 3c4125774099b7e92160a9a860cc07245f3d13f2 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sun, 17 Dec 2017 18:50:18 +0000
Subject: [PATCH 195/264] Debug compiling MKLDNN.

---
 prepare_mkldnn.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh
index 7a4fe4ce5207..10334b11e423 100755
--- a/prepare_mkldnn.sh
+++ b/prepare_mkldnn.sh
@@ -92,8 +92,9 @@ if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then
     fi
     echo "Building MKLDNN ..." >&2
     cd $MXNET_ROOTDIR
+    g++ --version
     cmake $MKLDNN_SRCDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR
-    make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l)
+    make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1
     make -C $MKLDNN_BUILDDIR install
     rm -rf $MKLDNN_BUILDDIR
 fi

From 79aa7f3090ce1db434d5e695c37dfb6199898a83 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sun, 17 Dec 2017 19:22:20 +0000
Subject: [PATCH 196/264] Use MKLDNN sum in more cases.
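This re-applies the earlier change with one refinement, condensed here as a sketch (names follow the elemwise-add diff below): dispatch to the MKLDNN path when both inputs are MKLDNN-compatible, but keep the output in MKLDNN storage only if at least one input already uses it:

    // Sketch of the refined storage-inference rule for elementwise add.
    bool support_mkldnn = SupportStorageMKLDNN(in_attrs->at(0))
        && SupportStorageMKLDNN(in_attrs->at(1));
    bool has_mkldnn = in_attrs->at(0) == kMKLDNNStorage
        || in_attrs->at(1) == kMKLDNNStorage;
    if (support_mkldnn && dev_mask == mshadow::cpu::kDevMask) {
      out_attrs->at(0) = has_mkldnn ? kMKLDNNStorage : kDefaultStorage;
      *dispatch_mode = DispatchMode::kFComputeEx;
    }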
---
 src/operator/nn/mkldnn/mkldnn_base-inl.h        |  7 ++++++-
 src/operator/tensor/elemwise_binary_op_basic.cc | 16 +++++++++++-----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 48d25022231d..5b3842604ed9 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -122,13 +122,18 @@ static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) {
   return support;
 }
+static inline bool SupportStorageMKLDNN(int stype) {
+  return stype == kMKLDNNStorage || stype == kDefaultStorage;
+}
+
 static inline bool SupportMKLDNN(int dtype, const TShape &shape) {
   int ndim = shape.ndim();
   return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4);
 }
 static inline bool SupportMKLDNN(const NDArray &input) {
-  return SupportMKLDNN(input.dtype(), input.shape());
+  return SupportMKLDNN(input.dtype(), input.shape())
+      && SupportStorageMKLDNN(input.storage_type());
 }
 static inline bool SupportMKLDNNConv(const NDArray &input) {

diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index 1c5ff0ec91d5..78955275d04f 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -25,6 +25,7 @@
 #include "./elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
 #include "../nn/mkldnn/mkldnn_ops-inl.h"
+#include "../nn/mkldnn/mkldnn_base-inl.h"
 namespace mxnet {
 namespace op {
@@ -37,8 +38,7 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 1U);
 #if MXNET_USE_MKLDNN == 1
-  if (inputs[0].storage_type() == kMKLDNNStorage
-      || inputs[1].storage_type() == kMKLDNNStorage) {
+  if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) {
     MKLDNNSum_Forward(attrs, ctx, inputs, req[0], outputs[0]);
     return;
   } else if (inputs[0].storage_type() == kDefaultStorage
@@ -68,9 +68,15 @@ static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 2);
   CHECK_EQ(out_attrs->size(), 1);
 #if MXNET_USE_MKLDNN == 1
-  if ((in_attrs->at(0) == kMKLDNNStorage || in_attrs->at(1) == kMKLDNNStorage)
-      && dev_mask == mshadow::cpu::kDevMask) {
-    out_attrs->at(0) = kMKLDNNStorage;
+  // If both inputs can be used by MKLDNN, we want to use MKLDNN.
+  auto support_mkldnn = SupportStorageMKLDNN(in_attrs->at(0))
+      && SupportStorageMKLDNN(in_attrs->at(1));
+  if (support_mkldnn && dev_mask == mshadow::cpu::kDevMask) {
+    // However, we only want the output to use MKLDNN storage if one of the
+    // inputs is in MKLDNN storage.
+    auto has_mkldnn = in_attrs->at(0) == kMKLDNNStorage
+        || in_attrs->at(1) == kMKLDNNStorage;
+    out_attrs->at(0) = has_mkldnn ? kMKLDNNStorage : kDefaultStorage;
     *dispatch_mode = DispatchMode::kFComputeEx;
     return true;
   }

From e7783f64f4803fcaa0adaa0b405f71768167fc68 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sun, 17 Dec 2017 21:57:24 +0000
Subject: [PATCH 197/264] Add mkldnn as a submodule.
--- .gitmodules | 4 ++++ 3rdparty/mkldnn | 1 + 2 files changed, 5 insertions(+) create mode 160000 3rdparty/mkldnn diff --git a/.gitmodules b/.gitmodules index 170c105a6f48..eec87413d859 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,3 +22,7 @@ [submodule "3rdparty/googletest"] path = 3rdparty/googletest url = https://github.com/google/googletest.git +[submodule "3rdparty/mkldnn"] + path = 3rdparty/mkldnn + url = https://github.com/01org/mkl-dnn.git + branch = master diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn new file mode 160000 index 000000000000..82cf37f626a0 --- /dev/null +++ b/3rdparty/mkldnn @@ -0,0 +1 @@ +Subproject commit 82cf37f626a0998d38f99c30e5a08a0dd5e49bc0 From 66a2557662846e1e39d49b3ca5944ff21b74520e Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 17 Dec 2017 22:27:13 +0000 Subject: [PATCH 198/264] Compile with mkldnn in 3rdparty. --- prepare_mkldnn.sh | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh index 10334b11e423..8dbd7a87b6df 100755 --- a/prepare_mkldnn.sh +++ b/prepare_mkldnn.sh @@ -56,16 +56,11 @@ # MXNET_ROOTDIR="$(pwd)" -MKLDNN_ROOTDIR="$MXNET_ROOTDIR/external/mkldnn" -MKLDNN_GITHUB="https://github.com/01org/mkl-dnn.git" -MKLDNN_TMPDIR="$MKLDNN_ROOTDIR/tmp" +MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/" MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src" MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build" MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install" -# MKL DNN release tag, or commit. -MKLDNN_COMMIT="v0.11" - # MKLDNN install destination HOME_MKLDNN=$1 if [ ! -z "$HOME_MKLDNN" ]; then @@ -79,22 +74,16 @@ fi if [ -z $MKLDNNROOT ]; then if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then mkdir -p $MKLDNN_INSTALLDIR - if [ ! -d $MKLDNN_SRCDIR/.git ]; then - echo "Downloading MKLDNN ..." >&2 - rm -rf $MKLDNN_SRCDIR - git clone --quiet --no-checkout $MKLDNN_GITHUB $MKLDNN_TMPDIR - rsync -a $MKLDNN_TMPDIR/ $MKLDNN_SRCDIR && rm -rf $MKLDNN_TMPDIR - fi - cd $MKLDNN_SRCDIR && git fetch --all && git reset --hard $MKLDNN_COMMIT + cd $MKLDNN_ROOTDIR if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. cp -a external/*/* $MKLDNN_INSTALLDIR/. fi echo "Building MKLDNN ..." >&2 cd $MXNET_ROOTDIR - g++ --version - cmake $MKLDNN_SRCDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR - make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 + g++ --version >&2 + cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR + make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2 make -C $MKLDNN_BUILDDIR install rm -rf $MKLDNN_BUILDDIR fi From 84469a423db11a171d3ae1b5497ce1d57e805a92 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 05:38:58 +0000 Subject: [PATCH 199/264] Fix some coding styles. 
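Most of this patch renames the MKLDNN entry points from underscore style to camel case and re-wraps long parameter lists, one parameter per line. One representative before/after pair (signatures as in the diff, with the vectors' element types restored for readability):

    // Before: underscore-separated name.
    void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                              const std::vector<NDArray> &in_data,
                              const std::vector<OpReqType> &req,
                              const std::vector<NDArray> &out_data);
    // After: camel-case name used throughout the MKLDNN operators.
    void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                             const std::vector<NDArray> &in_data,
                             const std::vector<OpReqType> &req,
                             const std::vector<NDArray> &out_data);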
--- src/operator/nn/concat.cc | 4 +- src/operator/nn/convolution.cc | 4 +- src/operator/nn/deconvolution.cc | 4 +- src/operator/nn/fully_connected.cc | 4 +- src/operator/nn/mkldnn/mkldnn_concat.cc | 14 ++-- src/operator/nn/mkldnn/mkldnn_convolution.cc | 9 +-- .../nn/mkldnn/mkldnn_deconvolution.cc | 14 ++-- .../nn/mkldnn/mkldnn_fully_connected.cc | 14 ++-- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 66 +++++++++++-------- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 14 ++-- src/operator/nn/mkldnn/mkldnn_softmax.cc | 5 +- src/operator/nn/mkldnn/mkldnn_sum.cc | 6 +- src/operator/nn/pooling.cc | 8 +-- src/operator/nn/softmax.cc | 2 +- .../tensor/elemwise_binary_op_basic.cc | 2 +- src/operator/tensor/elemwise_sum.cc | 2 +- 16 files changed, 95 insertions(+), 77 deletions(-) diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 0828c0f037c9..52ccd234db0f 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -160,7 +160,7 @@ void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs, // MKLDNN support 2D and 4D concat if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) { if (inputs[0].dtype() == mshadow::kFloat32) { - MKLDNNConcat_Forward(attrs, op_ctx, inputs, req, outputs); + MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs); } } else { std::vector in_blobs(inputs.size()); @@ -180,7 +180,7 @@ static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs, #if MXNET_USE_MKLDNN == 1 if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) { if (inputs[0].dtype() == mshadow::kFloat32) { - MKLDNNConcat_Backward(attrs, ctx, inputs, req, outputs); + MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs); } } else { std::vector in_blobs(1); diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ae0fd149501f..ad79c01ffbb0 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -53,7 +53,7 @@ static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNNConv(inputs[0])) { - MKLDNNConvolution_Forward(attrs, ctx, inputs, req, outputs); + MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs); return; } #endif @@ -71,7 +71,7 @@ static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNNConv(inputs[0])) { - MKLDNNConvolution_Backward(attrs, ctx, inputs, req, outputs); + MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs); return; } #endif diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 1c6323e42e91..71d0139eee6e 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -313,7 +313,7 @@ static void DeconvolutionCompute_CPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNNConv(inputs[0])) { - MKLDNNDeconvolution_Forward(attrs, ctx, inputs, req, outputs); + MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs); return; } #endif @@ -331,7 +331,7 @@ static void DeconvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNNConv(inputs[0])) { - MKLDNNDeconvolution_Backward(attrs, ctx, inputs, req, outputs); + MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs); return; } #endif diff --git a/src/operator/nn/fully_connected.cc 
b/src/operator/nn/fully_connected.cc index 4f7e425593b3..3723a03af8d0 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -78,7 +78,7 @@ void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ct const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNFC_Forward(attrs, ctx, inputs, req, outputs); + MKLDNNFCForward(attrs, ctx, inputs, req, outputs); return; } #endif @@ -96,7 +96,7 @@ void FullyConnectedGradCompute_CPU(const nnvm::NodeAttrs& attrs, const std::vector &req, const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNFC_Backward(attrs, ctx, inputs, req, outputs); + MKLDNNFCBackward(attrs, ctx, inputs, req, outputs); return; } #endif diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc index 56fb473b4fe0..d3e6e775020d 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -30,9 +30,10 @@ namespace mxnet { namespace op { -void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { +void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); const ConcatParam& param = nnvm::get(attrs.parsed); int num_in_data = param.num_args; @@ -54,9 +55,10 @@ void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, MKLDNNStream::Get()->Submit(); } -void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { +void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); const ConcatParam& param = nnvm::get(attrs.parsed); int num_in_data = param.num_args; diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index c74ac04057d2..0c39d81deacf 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -165,7 +165,7 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( } } -void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, +void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { @@ -193,9 +193,10 @@ void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ct MKLDNNStream::Get()->Submit(); } -void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { +void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); const std::vector &in_grad = outputs; auto engine = CpuEngine::Get()->get_engine(); diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index f306adcf4bd6..eda28e3d8cff 100644 --- 
a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -151,9 +151,10 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( } } -void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const DeconvolutionParam& param = nnvm::get(attrs.parsed); @@ -185,9 +186,10 @@ void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext & } } -void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const std::vector &in_grad = outputs; const DeconvolutionParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index cbb078f620f6..d82bc1a24c0a 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -80,9 +80,10 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei } } -void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { +void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); const FullyConnectedParam& param = nnvm::get(attrs.parsed); const TShape& ishape = in_data[fullc::kData].shape(); @@ -123,9 +124,10 @@ void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, MKLDNNStream::Get()->Submit(); } -void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const std::vector &req, - const std::vector &outputs) { +void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); const std::vector &in_grad = outputs; const FullyConnectedParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 1b8733b3aa17..820197efa0bb 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -42,36 +42,44 @@ namespace mxnet { namespace op { /* For fully connected. 
*/ -void MKLDNNFC_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data); -void MKLDNNFC_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const std::vector &req, - const std::vector &outputs); +void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); /* For convolution. */ -void MKLDNNConvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data); -void MKLDNNConvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs); +void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); /* For deconvolution */ -void MKLDNNDeconvolution_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data); -void MKLDNNDeconvolution_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs); +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); /* For softmax */ -void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, const NDArray &out_data); +void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); /* For sum */ -void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const OpReqType &req, const NDArray &out_data); +void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const OpReqType &req, + const NDArray &out_data); /* For copy */ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, @@ -79,12 +87,14 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &out_data); /* For concat */ -void MKLDNNConcat_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data); -void MKLDNNConcat_Backward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs); +void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + 
const std::vector& outputs); void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, const mkldnn::memory &out); diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index 825ff5600219..301517e1369e 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -119,9 +119,9 @@ inline bool MKLDNNRequireWorkspace(const PoolingParam ¶m) { return param.pool_type != pool_enum::kAvgPooling; } -void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data, const NDArray *workspace) { +void MKLDNNPoolingForward(const OpContext &ctx, const PoolingParam ¶m, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data, const NDArray *workspace) { auto input_mem = in_data.GetMKLDNNData(); auto data_mpd = input_mem->get_primitive_desc(); auto data_md = data_mpd.desc(); @@ -150,10 +150,10 @@ void MKLDNNPooling_Forward(const OpContext &ctx, const PoolingParam ¶m, MKLDNNStream::Get()->Submit(); } -void MKLDNNPooling_Backward(const OpContext &ctx, const PoolingParam ¶m, - const NDArray &out_grad, const NDArray &in_data, - const NDArray *workspace, const OpReqType &req, - const NDArray &in_grad) { +void MKLDNNPoolingBackward(const OpContext &ctx, const PoolingParam ¶m, + const NDArray &out_grad, const NDArray &in_data, + const NDArray *workspace, const OpReqType &req, + const NDArray &in_grad) { if (req == kNullOp) { return; } diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc index d8a2ab7ce1b8..aa59f13d06da 100644 --- a/src/operator/nn/mkldnn/mkldnn_softmax.cc +++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc @@ -31,8 +31,9 @@ namespace mxnet { namespace op { -void MKLDNNSoftmax_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { +void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { const SoftmaxParam& param = nnvm::get(attrs.parsed); auto input_mem = in_data.GetMKLDNNData(); mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc index cbeb405a429b..a012617a8bef 100644 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -48,9 +48,9 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); } -void MKLDNNSum_Forward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const OpReqType &req, - const NDArray &out_data) { +void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const OpReqType &req, + const NDArray &out_data) { std::vector in_prims; std::vector in_pds(inputs.size()); std::vector scales(inputs.size()); diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 845260e0af9d..7a774f49de53 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -234,8 +234,8 @@ void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, } if (SupportMKLDNN(inputs[0]) && SupportMKLDNNPooling(param, inputs[0].shape())) { - MKLDNNPooling_Forward(ctx, param, inputs[0], req[0], outputs[0], - workspace); + MKLDNNPoolingForward(ctx, param, inputs[0], 
req[0], outputs[0], + workspace); return; } #endif @@ -270,8 +270,8 @@ void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const NDArray &in_grad = outputs[0]; if (SupportMKLDNN(inputs[0]) && SupportMKLDNNPooling(param, inputs[0].shape())) { - MKLDNNPooling_Backward(ctx, param, out_grad, *in_data, workspace, - req[0], in_grad); + MKLDNNPoolingBackward(ctx, param, out_grad, *in_data, workspace, + req[0], in_grad); return; } #endif diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 9a5b5e91f1b3..86762adc9a92 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -40,7 +40,7 @@ static void SoftmaxCompute_CPU(const nnvm::NodeAttrs& attrs, // It seems MKLDNN softmax doesn't support training. // and it only supports non-negative axis. if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) { - MKLDNNSoftmax_Forward(attrs, ctx, inputs[0], req[0], outputs[0]); + MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]); return; } #endif diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 78955275d04f..c36225078275 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -39,7 +39,7 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) { - MKLDNNSum_Forward(attrs, ctx, inputs, req[0], outputs[0]); + MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]); return; } else if (inputs[0].storage_type() == kDefaultStorage || inputs[1].storage_type() == kDefaultStorage) { diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index ed12917594e2..1b70bbd8f436 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -111,7 +111,7 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); #if MXNET_USE_MKLDNN == 1 } else if (common::ContainsStorage(inputs, kMKLDNNStorage)) { - MKLDNNSum_Forward(attrs, op_ctx, inputs, req[0], outputs[0]); + MKLDNNSumForward(attrs, op_ctx, inputs, req[0], outputs[0]); #endif } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { // This case happens when we want to create an MKLDNN NDArray but the type From faec6b226b1a573f0273efa78945f9ef912ada45 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 19:24:29 +0000 Subject: [PATCH 200/264] download curl in ci build. --- tests/ci_build/install/ubuntu_install_core.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_build/install/ubuntu_install_core.sh b/tests/ci_build/install/ubuntu_install_core.sh index eefd7590cdbc..71721744859a 100755 --- a/tests/ci_build/install/ubuntu_install_core.sh +++ b/tests/ci_build/install/ubuntu_install_core.sh @@ -21,7 +21,7 @@ apt-get update && apt-get install -y \ build-essential git libopenblas-dev liblapack-dev libopencv-dev \ - libcurl4-openssl-dev libgtest-dev cmake wget unzip sudo + libcurl4-openssl-dev libgtest-dev cmake wget unzip sudo curl # Link Openblas to Cblas as this link does not exist on ubuntu16.04 ln -s /usr/lib/libopenblas.so /usr/lib/libcblas.so From 11f58e4d6188bce4cc2e51c5b955567266ca7f11 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 20:39:47 +0000 Subject: [PATCH 201/264] write the path to mkldnn lib in libmxnet.so. 
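Without an rpath entry, libmxnet.so loads only when libmkldnn.so can be found
via LD_LIBRARY_PATH. Linking with -Wl,-rpath,$(MKLDNNROOT)/lib records the
MKLDNN install directory in the dynamic section of libmxnet.so, so the runtime
loader resolves the dependency on its own (readelf -d lib/libmxnet.so should
now list an RPATH entry). The absolute path is made relocatable with $ORIGIN
in a follow-up commit.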
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e49afdd00f88..2fbdbb11955f 100644 --- a/Makefile +++ b/Makefile @@ -121,7 +121,7 @@ ifeq ($(USE_MKLDNN), 1) LDFLAGS += -L$(MKLROOT)/lib endif CFLAGS += -I$(MKLDNNROOT)/include - LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn + LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,$(MKLDNNROOT)/lib endif ifeq ($(USE_OPERATOR_TUNING), 1) From 8f6162484c209099bce8b4533f6efa9112ccf5f3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 22:04:49 +0000 Subject: [PATCH 202/264] use rpath with $ORIGIN. --- Makefile | 2 +- prepare_mkldnn.sh | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2fbdbb11955f..48b2471811f8 100644 --- a/Makefile +++ b/Makefile @@ -121,7 +121,7 @@ ifeq ($(USE_MKLDNN), 1) LDFLAGS += -L$(MKLROOT)/lib endif CFLAGS += -I$(MKLDNNROOT)/include - LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,$(MKLDNNROOT)/lib + LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' endif ifeq ($(USE_OPERATOR_TUNING), 1) diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh index 8dbd7a87b6df..525ee14775cf 100755 --- a/prepare_mkldnn.sh +++ b/prepare_mkldnn.sh @@ -60,6 +60,7 @@ MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/" MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src" MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build" MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install" +MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib" # MKLDNN install destination HOME_MKLDNN=$1 @@ -86,6 +87,8 @@ if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2 make -C $MKLDNN_BUILDDIR install rm -rf $MKLDNN_BUILDDIR + mkdir -p $MKLDNN_LIBDIR + cp $MKLDNN_INSTALLDIR/lib/* $MKLDNN_LIBDIR fi MKLDNNROOT=$MKLDNN_INSTALLDIR fi From d9eae20fa48385b6973a0c126a8d2ebb67eb95d8 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 23:18:51 +0000 Subject: [PATCH 203/264] Pack all lib files in Jenkins. --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 94d56e04ba3f..456ba3e8a507 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3,7 +3,7 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // mxnet libraries -mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' +mx_lib = 'lib/*.so*, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes From cb97b07db217ecba81046a1590231117bbf98031 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 23:45:23 +0000 Subject: [PATCH 204/264] pack and unpack mxnet with MKLDNN. 
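pack_lib and unpack_lib now take the artifact list as an explicit argument:
MKLDNN builds stash and unstash mx_mkldnn_lib (libmxnet plus the libmkldnn and
MKLML/OpenMP runtime libraries), while the other builds keep using mx_lib.
Presumably the helpers default to mx_lib when no list is passed, since the
non-MKLDNN call sites are left unchanged.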
--- Jenkinsfile | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 456ba3e8a507..0beb08a6f6e6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3,7 +3,8 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // mxnet libraries -mx_lib = 'lib/*.so*, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' +mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes @@ -163,7 +164,7 @@ try { -j\$(nproc) """ make("cpu_mkldnn", flag) - pack_lib('mkldnn_cpu') + pack_lib('mkldnn_cpu', mx_mkldnn_lib) } } }, @@ -183,7 +184,7 @@ try { -j\$(nproc) """ make("build_cuda", flag) - pack_lib('mkldnn_gpu') + pack_lib('mkldnn_gpu', mx_mkldnn_lib) } } }, @@ -330,7 +331,7 @@ try { node('mxnetlinux-cpu') { ws('workspace/ut-python2-mkldnn-cpu') { init_git() - unpack_lib('mkldnn_cpu') + unpack_lib('mkldnn_cpu', mx_mkldnn_lib) python2_ut('cpu_mkldnn') } } @@ -339,7 +340,7 @@ try { node('mxnetlinux-gpu') { ws('workspace/ut-python2-mkldnn-gpu') { init_git() - unpack_lib('mkldnn_gpu') + unpack_lib('mkldnn_gpu', mx_mkldnn_lib) python2_gpu_ut('gpu_mkldnn') } } @@ -348,7 +349,7 @@ try { node('mxnetlinux-cpu') { ws('workspace/ut-python3-mkldnn-cpu') { init_git() - unpack_lib('mkldnn_cpu') + unpack_lib('mkldnn_cpu', mx_mkldnn_lib) python3_ut('cpu_mkldnn') } } @@ -357,7 +358,7 @@ try { node('mxnetlinux-gpu') { ws('workspace/ut-python3-mkldnn-gpu') { init_git() - unpack_lib('mkldnn_gpu') + unpack_lib('mkldnn_gpu', mx_mkldnn_lib) python3_gpu_ut('gpu_mkldnn') } } From 5dc3c428967c68a97bb70ce83c4436091c036156 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 16:09:41 -0800 Subject: [PATCH 205/264] Update Jenkinsfile --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0beb08a6f6e6..f6a4699087ee 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,7 +4,7 @@ // mxnet libraries mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes From 456086bdaa5e08c2d6ee82d15f0ad4735efc08c2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 18 Dec 2017 16:17:18 -0800 Subject: [PATCH 206/264] Update Jenkinsfile --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f6a4699087ee..ea67fb1765bc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -4,7 +4,7 @@ // mxnet libraries mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmkldnn.so.0, 
lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes From 9153471bb2f0fcd45284bb639eb549983fef8da7 Mon Sep 17 00:00:00 2001 From: Lv Tao Date: Mon, 18 Dec 2017 20:21:59 +0800 Subject: [PATCH 207/264] Add mkldnn batch normalization --- Makefile | 4 + src/operator/nn/batch_norm.cc | 239 ++++++++++++- .../nn/mkldnn/mkldnn_batch_norm-inl.h | 328 ++++++++++++++++++ 3 files changed, 570 insertions(+), 1 deletion(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h diff --git a/Makefile b/Makefile index 48b2471811f8..499dd905baa9 100644 --- a/Makefile +++ b/Makefile @@ -124,6 +124,10 @@ ifeq ($(USE_MKLDNN), 1) LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' endif +ifeq ($(BN_DEBUG), 1) + CFLAGS += -DMXNET_BN_DEBUG=1 +endif + ifeq ($(USE_OPERATOR_TUNING), 1) CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1 endif diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 04970da68373..dcdf8b03baf3 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -32,6 +32,9 @@ #endif // MXNET_USE_MKL2017 #include #include "../elemwise_op_common.h" +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_batch_norm-inl.h" +#endif /*! \brief inverse standard deviation <-> variance */ #define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) @@ -379,6 +382,236 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, return true; } +static inline bool similar_array(const mxnet::NDArray &arr1, + const mxnet::NDArray &arr2, + float tol) { + float *data1 = reinterpret_cast(arr1.data().dptr_); + float *data2 = reinterpret_cast(arr2.data().dptr_); + if (arr1.shape().Size() != arr2.shape().Size()) + return false; + for (size_t i = 0; i < arr1.shape().Size(); i++) { + if (std::abs(data1[i] - data2[i]) > tol) { + // printf("similar_array: %.8f, %.8f \n", data1[i], data2[i]); + return false; + } + } + std::cout << "similar_array: passed all check, tol=" << tol << std::endl; + return true; +} + +static inline mxnet::NDArray copy_arr(const mxnet::NDArray &arr) { + if (arr.storage_type() == mxnet::kMKLDNNStorage) { + auto mklmem = arr.GetMKLDNNData(); + mxnet::NDArray new_arr(arr.shape(), arr.ctx(), false, arr.dtype()); + auto p = new_arr.data().dptr(); + + mxnet::TShape sh = arr.shape(); + CHECK_EQ(sh.ndim(), 4U); + memory::dims _dim = {static_cast(sh[0]), + static_cast(sh[1]), + static_cast(sh[2]), + static_cast(sh[3])}; + auto user_desc = mkldnn::memory::desc({_dim}, memory::data_type::f32, memory::format::nchw); + auto user_pd = mkldnn::memory::primitive_desc(user_desc, CpuEngine::Get()->get_engine()); + auto user_mem = mkldnn::memory(user_pd); + user_mem.set_data_handle(new_arr.data().dptr_); + std::vector net; + if (user_pd != mklmem->get_primitive_desc()) { + auto re = mkldnn::reorder(*mklmem, user_mem); + net.push_back(re); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + } else { + memcpy(p, mklmem->get_data_handle(), + arr.shape().Size() * 4); + } + return new_arr; + } else if (arr.storage_type() == mxnet::kDefaultStorage) { + mxnet::NDArray new_arr(arr.shape(), arr.ctx(), false, arr.dtype()); + memcpy(new_arr.data().dptr_, arr.data().dptr_, + arr.shape().Size() * 4); + return new_arr; + } else { + LOG(FATAL) << "copy_arr: storage type is not supported"; + } +} + +void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector 
&req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 5U); +#if MXNET_USE_MKLDNN == 1 + if (SupportMKLDNN(inputs[0])) { + const BatchNormParam ¶m = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); + std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); + + switch (inputs[0].dtype()) { + case mshadow::kFloat32: +#if MXNET_BN_DEBUG == 1 + std::cout << "BatchNorm runs into MKLDNN debug" << std::endl; + std::vector inp; + for (size_t i = 0; i < in_data.size(); i++) { + inp.push_back(copy_arr(in_data[i])); + } + + std::vector out; + out.push_back(outputs[0]); + out.push_back(copy_arr(outputs[1])); + out.push_back(copy_arr(outputs[2])); + MKLDNNBatchNorm_Forward(ctx, param, inp, req, out, aux_states); + auto temp_output = copy_arr(out[0]); + + // Run with original path + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) { + in_blobs[i] = inputs[i].data(); + } + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) { + out_blobs[i] = outputs[i].data(); + } + BatchNormCompute(attrs, ctx, in_blobs, req, out_blobs); + CHECK_EQ(similar_array(temp_output, outputs[0], 1e-8), true); +#else + MKLDNNBatchNorm_Forward(ctx, param, in_data, req, outputs, aux_states); +#endif + return; + } + } +#endif + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) { + in_blobs[i] = inputs[i].data(); + } + + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) { + out_blobs[i] = outputs[i].data(); + } + BatchNormCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 11U); +#if MXNET_USE_MKLDNN == 1 + if (SupportMKLDNN(inputs[0])) { + const BatchNormParam ¶m = nnvm::get(attrs.parsed); + std::vector out_grad(inputs.begin(), + inputs.begin() + (param.output_mean_var ? 3U : 1U)); + std::vector in_data(inputs.begin() + 3U, inputs.begin() + 6U); + std::vector aux_states(inputs.begin() + 6U, inputs.begin() + 8U); + std::vector out_data(inputs.begin() + 8U, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3U); + + switch (inputs[0].dtype()) { + case mshadow::kFloat32: +#if MXNET_BN_DEBUG == 1 + std::cout << "BatchNorm backward runs into MKLDNN debug" << std::endl; + std::vector inp; + for (size_t i = 0; i < in_data.size(); i++) { + inp.push_back(copy_arr(in_data[i])); + } + + std::vector out; + out.push_back(in_grad[0]); + out.push_back(copy_arr(in_grad[1])); + out.push_back(copy_arr(in_grad[2])); + + MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, + out_data, req, out, aux_states); + auto temp_output = copy_arr(out[0]); + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) { + in_blobs[i] = inputs[i].data(); + } + + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) { + out_blobs[i] = outputs[i].data(); + } + BatchNormGradCompute(attrs, ctx, in_blobs, req, out_blobs); + CHECK_EQ(similar_array(temp_output, in_grad[0], 1e-8), true); +#else + MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); +#endif + return; + } + } +#endif + // cast NDArray to TBlob, and call original implementation. 
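+ // (NDArray::data() exposes the underlying TBlob, so this dense fallback can
+ // reuse the reference FCompute kernel unchanged, without extra copies.)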
+ std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) { + in_blobs[i] = inputs[i].data(); + } + + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) { + out_blobs[i] = outputs[i].data(); + } + BatchNormGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 5); + CHECK_EQ(out_attrs->size(), 3); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (int& v : *in_attrs) { + if (v == - 1) v = kDefaultStorage; + } + (*out_attrs)[0] = kMKLDNNStorage; + (*out_attrs)[1] = kDefaultStorage; + (*out_attrs)[2] = kDefaultStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) { + (*out_attrs)[i] = kDefaultStorage; + } + return true; +} + +static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 11); + CHECK_EQ(out_attrs->size(), 5); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (int& v : *in_attrs) { + if (v == - 1) v = kDefaultStorage; + } + (*out_attrs)[0] = kMKLDNNStorage; + (*out_attrs)[1] = kDefaultStorage; + (*out_attrs)[2] = kDefaultStorage; + (*out_attrs)[3] = kDefaultStorage; + (*out_attrs)[4] = kDefaultStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) { + (*out_attrs)[i] = kDefaultStorage; + } + return true; +} + NNVM_REGISTER_OP(BatchNorm) .describe(R"code(Batch normalization. @@ -446,7 +679,9 @@ then set ``gamma`` to 1 and its gradient to 0. }) .set_attr("FInferShape", BatchNormShape) .set_attr("FInferType", BatchNormType) +.set_attr("FInferStorageType", BatchNormStorageType) .set_attr("FCompute", BatchNormCompute) +.set_attr("FComputeEx", BatchNormCompute_CPU) .set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") @@ -468,8 +703,10 @@ then set ``gamma`` to 1 and its gradient to 0. NNVM_REGISTER_OP(_backward_BatchNorm) .set_num_outputs(5) .set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_BatchNormStorageType) .set_attr_parser(ParamParser) -.set_attr("FCompute", BatchNormGradCompute); +.set_attr("FCompute", BatchNormGradCompute) +.set_attr("FComputeEx", BatchNormGradCompute_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h new file mode 100644 index 000000000000..9ca0378bfa7d --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_batch_norm.cc + * \brief + * \author Tao Lv +*/ + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include +#include +#include "../batch_norm-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) +#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$)) +namespace mxnet { +namespace op { + +typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc; +typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc; +typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc; +typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc; + +using mkldnn::use_global_stats; +using mkldnn::use_scale_shift; +using mkldnn::forward_training; +using mkldnn::forward_inference; + +inline static unsigned _GetFlags(const std::vector &in_data, + const std::vector &aux_states, + const BatchNormParam ¶m, bool is_train) { + unsigned flags = 0U; + if (in_data.size() == 3U) { + flags |= use_scale_shift; + } + + // aux_states[0]: inMean + // aux_states[1]: inVariance + if (aux_states.size() == 2U && !is_train) { + flags |= use_global_stats; + } + return flags; +} + +template +inline static t_bn_f_pdesc _GetFwd(const NDArray &data, bool is_train, + DType eps, unsigned flags) { + auto data_mem = data.GetMKLDNNData(); + auto data_mpd = data_mem->get_primitive_desc(); + auto data_md = data_mpd.desc(); + auto engine = CpuEngine::Get()->get_engine(); + + if (is_train) { + t_bn_f_desc bnFwd_desc(forward_training, data_md, eps, flags); + return t_bn_f_pdesc(bnFwd_desc, engine); + } else { + t_bn_f_desc bnFwd_desc(forward_inference, data_md, eps, flags); + return t_bn_f_pdesc(bnFwd_desc, engine); + } +} + +template +inline static t_bn_b_pdesc _GetBwd(const NDArray &data, const NDArray &diff_data, + DType eps, unsigned flags) { + auto data_mem = data.GetMKLDNNData(); + auto data_mpd = data_mem->get_primitive_desc(); + auto data_md = data_mpd.desc(); + auto diff_mem = diff_data.GetMKLDNNData(); + auto diff_mpd = diff_mem->get_primitive_desc(); + auto diff_md = diff_mpd.desc(); + auto engine = CpuEngine::Get()->get_engine(); + + t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags); + return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data, true, eps, flags)); +} + +template +void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); + const NDArray &data = in_data[batchnorm::kData]; + + auto data_mem = data.GetMKLDNNData(); + auto fwd_pd = _GetFwd(data, ctx.is_train, (DType) param.eps, flags); + const NDArray &out = out_data[batchnorm::kOut]; + + // for output memory + auto out_mem = const_cast(out).CreateMKLDNNData(fwd_pd.dst_primitive_desc()); + + // mxnet will always use scale 
shift.
+ // But if fix_gamma is true, then all scale elements will be set to 1.0f
+ if (flags & use_scale_shift) {
+ const NDArray &gamma = in_data[batchnorm::kGamma];
+ const NDArray &beta = in_data[batchnorm::kBeta];
+ CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
+ CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
+
+ // TODO(tao): how to reuse this memory?
+ std::shared_ptr weight_mem(
+ new mkldnn::memory(fwd_pd.weights_primitive_desc()));
+ DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle());
+
+ nnvm::dim_t channels_ = data.shape()[1];
+ for (int i = 0; i < channels_; i++) {
+ if (!param.fix_gamma)
+ weight_buf[i] = (gamma.data().dptr())[i]; // weight
+ else
+ weight_buf[i] = (DType)1.0f;
+ }
+
+ for (int i = 0; i < channels_; i++) {
+ weight_buf[channels_ + i] = (beta.data().dptr())[i]; // bias
+ }
+
+ if (!ctx.is_train) {
+ // std::cout << "bn forward: inference and no global status" << std::endl;
+ DType* omean = out_data[batchnorm::kMean].data().dptr();
+ DType* ovar = out_data[batchnorm::kVar].data().dptr();
+ DType* inmean = aux_states[batchnorm::kMovingMean].data().dptr();
+ DType* invar = aux_states[batchnorm::kMovingVar].data().dptr();
+ // to align with the original implementation: batch_norm.cc: L164
+ for (int i = 0; i < channels_; i++) {
+ omean[i] = (aux_states[batchnorm::kMovingMean].data().dptr())[i];
+ ovar[i] = VARIANCE_TO_INVSTD(
+ (aux_states[batchnorm::kMovingVar].data().dptr())[i], param.eps);
+ }
+ std::shared_ptr mean_m(
+ new mkldnn::memory(fwd_pd.mean_primitive_desc(), inmean));
+ std::shared_ptr var_m(
+ new mkldnn::memory(fwd_pd.variance_primitive_desc(), invar));
+ auto bn = mkldnn::batch_normalization_forward(fwd_pd,
+ *data_mem,
+ mkldnn::primitive::at(*mean_m),
+ mkldnn::primitive::at(*var_m),
+ *weight_mem,
+ *out_mem);
+ MKLDNNStream::Get()->RegisterPrim(bn);
+ MKLDNNStream::Get()->Submit();
+ return;
+ } else { // training
+ // std::cout << "bn forward here.." << std::endl;
+ const NDArray &outMean = out_data[batchnorm::kMean];
+ const NDArray &outVar = out_data[batchnorm::kVar];
+ CHECK_EQ(outMean.storage_type(), mxnet::kDefaultStorage);
+ CHECK_EQ(outVar.storage_type(), mxnet::kDefaultStorage);
+ DType* omean = out_data[batchnorm::kMean].data().dptr();
+ DType* ovar = out_data[batchnorm::kVar].data().dptr();
+
+ std::shared_ptr mean_mem(
+ new mkldnn::memory(fwd_pd.mean_primitive_desc(), omean));
+ std::shared_ptr var_mem(
+ new mkldnn::memory(fwd_pd.variance_primitive_desc(), ovar));
+
+ auto bn = mkldnn::batch_normalization_forward(fwd_pd,
+ mkldnn::primitive::at(*data_mem),
+ mkldnn::primitive::at(*weight_mem),
+ *out_mem,
+ *mean_mem,
+ *var_mem);
+ MKLDNNStream::Get()->RegisterPrim(bn);
+ MKLDNNStream::Get()->Submit();
+ for (int i = 0; i < channels_; i++) {
+ omean[i] = (reinterpret_cast(mean_mem->get_data_handle()))[i];
+ ovar[i] = VARIANCE_TO_INVSTD(
+ (reinterpret_cast(var_mem->get_data_handle()))[i], param.eps);
+ }
+ return;
+ }
+ } else { // no input gamma and beta
+ LOG(FATAL) << "MKLDNN batch normalization: should not reach here ...";
+ }
+ return;
+}
+
+template
+void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam &param,
+ const std::vector &out_grad,
+ const std::vector &in_data,
+ const std::vector &out_data,
+ const std::vector &req,
+ const std::vector &in_grad,
+ const std::vector &aux_states) {
+ CHECK_EQ(out_grad.size(), param.output_mean_var ?
3U : 1U); + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(in_grad.size(), 3U); + unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); + + const NDArray &data = in_data[batchnorm::kData]; + const NDArray &diff = out_grad[batchnorm::kOut]; + const NDArray &gradIn = in_grad[batchnorm::kData]; + const NDArray &moving_mean = aux_states[batchnorm::kMovingMean]; + const NDArray &moving_var = aux_states[batchnorm::kMovingVar]; + const NDArray &out_mean = out_data[batchnorm::kMean]; + const NDArray &out_var = out_data[batchnorm::kVar]; + + CHECK_EQ(out_mean.storage_type(), mxnet::kDefaultStorage); + CHECK_EQ(out_var.storage_type(), mxnet::kDefaultStorage); + CHECK_EQ(moving_mean.storage_type(), mxnet::kDefaultStorage); + CHECK_EQ(moving_var.storage_type(), mxnet::kDefaultStorage); + + auto data_mem = data.GetMKLDNNData(); + auto diff_mem = diff.GetMKLDNNData(); + auto fwd_pd = _GetFwd(data, ctx.is_train, param.eps, flags); + auto bwd_pd = _GetBwd(data, diff, param.eps, flags); + auto gradi_mem = const_cast(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc()); + + if (flags & use_scale_shift) { + const NDArray &gamma = in_data[batchnorm::kGamma]; + const NDArray &beta = in_data[batchnorm::kBeta]; + // TODO(tao): how to reuse this memory? + std::shared_ptr weight_mem( + new mkldnn::memory(bwd_pd.weights_primitive_desc())); + + DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle()); + nnvm::dim_t channels_ = data.shape()[1]; + for (int i = 0; i < channels_; i++) { + if (!param.fix_gamma) + weight_buf[i] = (gamma.data().dptr())[i]; // weight + else + weight_buf[i] = (DType)1.0f; + } + + for (int i = 0; i < channels_; i++) { + weight_buf[channels_ + i] = (beta.data().dptr())[i]; // bias + } + + std::shared_ptr gradw_mem( + new mkldnn::memory(bwd_pd.diff_weights_primitive_desc())); + // training but no input mean and variance + if (ctx.is_train && !param.use_global_stats) { + // std::cout << "bn backward here .." 
<< std::endl; + DType* moving_mean_ptr = reinterpret_cast(moving_mean.data().dptr()); + DType* moving_var_ptr = reinterpret_cast(moving_var.data().dptr()); + DType* out_mean_ptr = reinterpret_cast(out_mean.data().dptr()); + DType* out_var_ptr = reinterpret_cast(out_var.data().dptr()); + + DType minus_mom = (1.0f - param.momentum); + for (int i = 0; i < channels_; i++) { + moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + + out_mean_ptr[i] * minus_mom; + moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + + out_var_ptr[i] * minus_mom; + } + + std::shared_ptr out_mean_mem( + new mkldnn::memory(bwd_pd.mean_primitive_desc(), out_mean_ptr)); + std::shared_ptr out_var_mem( + new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr)); + + auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, + *data_mem, + mkldnn::primitive::at(*out_mean_mem), + mkldnn::primitive::at(*out_var_mem), + *diff_mem, + *weight_mem, + *gradi_mem, + *gradw_mem); + + MKLDNNStream::Get()->RegisterPrim(bn_bwd); + MKLDNNStream::Get()->Submit(); + } else { + std::shared_ptr imean_mem( + new mkldnn::memory(bwd_pd.mean_primitive_desc(), + moving_mean.data().dptr())); + std::shared_ptr ivar_mem( + new mkldnn::memory(bwd_pd.variance_primitive_desc(), + moving_var.data().dptr())); + auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, + *data_mem, + mkldnn::primitive::at(*imean_mem), + mkldnn::primitive::at(*ivar_mem), + *diff_mem, + *weight_mem, + *gradi_mem, + *gradw_mem); + + MKLDNNStream::Get()->RegisterPrim(bn_bwd); + MKLDNNStream::Get()->Submit(); + } + + // copy data from gradw_mem to in_grad[1] and in_grad[2] + DType* gw_buf = reinterpret_cast(gradw_mem->get_data_handle()); + for (int i = 0; i < channels_; i++) { + if (!param.fix_gamma) + (in_grad[1].data().dptr())[i] = gw_buf[i]; + else + (in_grad[1].data().dptr())[i] = 0.0f; + } + + for (int i = 0; i < channels_; i++) { + (in_grad[2].data().dptr())[i] = gw_buf[i + channels_]; + } + return; + } else { + LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ..."; + return; + } +} +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ From 91fcd10ba5bf215987bc07540faa205b9b5cfa07 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 07:19:21 +0000 Subject: [PATCH 208/264] Fix bugs in BN. --- src/operator/nn/batch_norm-inl.h | 1 + src/operator/nn/batch_norm.cc | 119 ++++-------------- .../nn/mkldnn/mkldnn_batch_norm-inl.h | 47 +++---- 3 files changed, 47 insertions(+), 120 deletions(-) diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 22234dec0699..ccedd4685f5c 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -50,6 +50,7 @@ namespace batchnorm { enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean, kInMovingVar}; // kGamma: weights, kBeta: biases enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data +enum BatchNormOpResource {kTempSpace}; enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states /*! 
\brief Default channel axis if none specified int he params */ diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index dcdf8b03baf3..ccea73c19dfc 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -399,42 +399,6 @@ static inline bool similar_array(const mxnet::NDArray &arr1, return true; } -static inline mxnet::NDArray copy_arr(const mxnet::NDArray &arr) { - if (arr.storage_type() == mxnet::kMKLDNNStorage) { - auto mklmem = arr.GetMKLDNNData(); - mxnet::NDArray new_arr(arr.shape(), arr.ctx(), false, arr.dtype()); - auto p = new_arr.data().dptr(); - - mxnet::TShape sh = arr.shape(); - CHECK_EQ(sh.ndim(), 4U); - memory::dims _dim = {static_cast(sh[0]), - static_cast(sh[1]), - static_cast(sh[2]), - static_cast(sh[3])}; - auto user_desc = mkldnn::memory::desc({_dim}, memory::data_type::f32, memory::format::nchw); - auto user_pd = mkldnn::memory::primitive_desc(user_desc, CpuEngine::Get()->get_engine()); - auto user_mem = mkldnn::memory(user_pd); - user_mem.set_data_handle(new_arr.data().dptr_); - std::vector net; - if (user_pd != mklmem->get_primitive_desc()) { - auto re = mkldnn::reorder(*mklmem, user_mem); - net.push_back(re); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - } else { - memcpy(p, mklmem->get_data_handle(), - arr.shape().Size() * 4); - } - return new_arr; - } else if (arr.storage_type() == mxnet::kDefaultStorage) { - mxnet::NDArray new_arr(arr.shape(), arr.ctx(), false, arr.dtype()); - memcpy(new_arr.data().dptr_, arr.data().dptr_, - arr.shape().Size() * 4); - return new_arr; - } else { - LOG(FATAL) << "copy_arr: storage type is not supported"; - } -} - void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -449,34 +413,7 @@ void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, switch (inputs[0].dtype()) { case mshadow::kFloat32: -#if MXNET_BN_DEBUG == 1 - std::cout << "BatchNorm runs into MKLDNN debug" << std::endl; - std::vector inp; - for (size_t i = 0; i < in_data.size(); i++) { - inp.push_back(copy_arr(in_data[i])); - } - - std::vector out; - out.push_back(outputs[0]); - out.push_back(copy_arr(outputs[1])); - out.push_back(copy_arr(outputs[2])); - MKLDNNBatchNorm_Forward(ctx, param, inp, req, out, aux_states); - auto temp_output = copy_arr(out[0]); - - // Run with original path - std::vector in_blobs(inputs.size()); - for (size_t i = 0; i < in_blobs.size(); i++) { - in_blobs[i] = inputs[i].data(); - } - std::vector out_blobs(outputs.size()); - for (size_t i = 0; i < out_blobs.size(); i++) { - out_blobs[i] = outputs[i].data(); - } - BatchNormCompute(attrs, ctx, in_blobs, req, out_blobs); - CHECK_EQ(similar_array(temp_output, outputs[0], 1e-8), true); -#else MKLDNNBatchNorm_Forward(ctx, param, in_data, req, outputs, aux_states); -#endif return; } } @@ -502,45 +439,23 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { const BatchNormParam ¶m = nnvm::get(attrs.parsed); - std::vector out_grad(inputs.begin(), - inputs.begin() + (param.output_mean_var ? 3U : 1U)); - std::vector in_data(inputs.begin() + 3U, inputs.begin() + 6U); - std::vector aux_states(inputs.begin() + 6U, inputs.begin() + 8U); - std::vector out_data(inputs.begin() + 8U, inputs.end()); - std::vector in_grad(outputs.begin(), outputs.begin() + 3U); + int num_out_grads = param.output_mean_var ? 
3U : 1U; + int in_data_start = 3; + int aux_states_start = in_data_start + batchnorm::kInMovingMean; + int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; + + std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); + std::vector in_data(inputs.begin() + in_data_start, + inputs.begin() + aux_states_start); + std::vector aux_states(inputs.begin() + aux_states_start, + inputs.begin() + out_data_start); + std::vector out_data(inputs.begin() + out_data_start, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); switch (inputs[0].dtype()) { case mshadow::kFloat32: -#if MXNET_BN_DEBUG == 1 - std::cout << "BatchNorm backward runs into MKLDNN debug" << std::endl; - std::vector inp; - for (size_t i = 0; i < in_data.size(); i++) { - inp.push_back(copy_arr(in_data[i])); - } - - std::vector out; - out.push_back(in_grad[0]); - out.push_back(copy_arr(in_grad[1])); - out.push_back(copy_arr(in_grad[2])); - - MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, - out_data, req, out, aux_states); - auto temp_output = copy_arr(out[0]); - std::vector in_blobs(inputs.size()); - for (size_t i = 0; i < in_blobs.size(); i++) { - in_blobs[i] = inputs[i].data(); - } - - std::vector out_blobs(outputs.size()); - for (size_t i = 0; i < out_blobs.size(); i++) { - out_blobs[i] = outputs[i].data(); - } - BatchNormGradCompute(attrs, ctx, in_blobs, req, out_blobs); - CHECK_EQ(similar_array(temp_output, in_grad[0], 1e-8), true); -#else MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states); -#endif return; } } @@ -683,6 +598,11 @@ then set ``gamma`` to 1 and its gradient to 0. .set_attr("FCompute", BatchNormCompute) .set_attr("FComputeEx", BatchNormCompute_CPU) .set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") .add_argument("beta", "NDArray-or-Symbol", "beta array") @@ -704,6 +624,11 @@ NNVM_REGISTER_OP(_backward_BatchNorm) .set_num_outputs(5) .set_attr("TIsBackward", true) .set_attr("FInferStorageType", backward_BatchNormStorageType) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr_parser(ParamParser) .set_attr("FCompute", BatchNormGradCompute) .set_attr("FComputeEx", BatchNormGradCompute_CPU); diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index 9ca0378bfa7d..64f79abf7f57 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -65,10 +65,9 @@ inline static unsigned _GetFlags(const std::vector &in_data, } template -inline static t_bn_f_pdesc _GetFwd(const NDArray &data, bool is_train, +inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem, bool is_train, DType eps, unsigned flags) { - auto data_mem = data.GetMKLDNNData(); - auto data_mpd = data_mem->get_primitive_desc(); + auto data_mpd = data_mem.get_primitive_desc(); auto data_md = data_mpd.desc(); auto engine = CpuEngine::Get()->get_engine(); @@ -82,18 +81,16 @@ inline static t_bn_f_pdesc _GetFwd(const NDArray &data, bool is_train, } template -inline static t_bn_b_pdesc _GetBwd(const NDArray &data, const NDArray &diff_data, +inline static 
t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, const mkldnn::memory &diff_mem, DType eps, unsigned flags) { - auto data_mem = data.GetMKLDNNData(); - auto data_mpd = data_mem->get_primitive_desc(); + auto data_mpd = data_mem.get_primitive_desc(); auto data_md = data_mpd.desc(); - auto diff_mem = diff_data.GetMKLDNNData(); - auto diff_mpd = diff_mem->get_primitive_desc(); + auto diff_mpd = diff_mem.get_primitive_desc(); auto diff_md = diff_mpd.desc(); auto engine = CpuEngine::Get()->get_engine(); t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags); - return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data, true, eps, flags)); + return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags)); } template @@ -102,11 +99,12 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, const std::vector &req, const std::vector &out_data, const std::vector &aux_states) { + TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); const NDArray &data = in_data[batchnorm::kData]; auto data_mem = data.GetMKLDNNData(); - auto fwd_pd = _GetFwd(data, ctx.is_train, (DType) param.eps, flags); + auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, (DType) param.eps, flags); const NDArray &out = out_data[batchnorm::kOut]; // for output memory @@ -129,8 +127,11 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, for (int i = 0; i < channels_; i++) { if (!param.fix_gamma) weight_buf[i] = (gamma.data().dptr())[i]; // weight - else + else { weight_buf[i] = (DType)1.0f; + if (IsBNWriting(req[batchnorm::kGamma])) + (gamma.data().dptr())[i] = (DType)1.0f; + } } for (int i = 0; i < channels_; i++) { @@ -138,7 +139,6 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, } if (!ctx.is_train) { - // std::cout << "bn forward: inference and no global status" << std::endl; DType* omean = out_data[batchnorm::kMean].data().dptr(); DType* ovar = out_data[batchnorm::kVar].data().dptr(); DType* inmean = aux_states[batchnorm::kMovingMean].data().dptr(); @@ -161,9 +161,7 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, *out_mem); MKLDNNStream::Get()->RegisterPrim(bn); MKLDNNStream::Get()->Submit(); - return; } else { // training - // std::cout << "bn forward here.." << std::endl; const NDArray &outMean = out_data[batchnorm::kMean]; const NDArray &outVar = out_data[batchnorm::kVar]; CHECK_EQ(outMean.storage_type(), mxnet::kDefaultStorage); @@ -189,12 +187,10 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, ovar[i] = VARIANCE_TO_INVSTD( (reinterpret_cast(var_mem->get_data_handle()))[i], param.eps); } - return; } } else { // no input gamma and beta LOG(FATAL) << "MKLDNN batch normalization: should not reach here ..."; } - return; } template @@ -205,6 +201,7 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, const std::vector &req, const std::vector &in_grad, const std::vector &aux_states) { + TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); CHECK_EQ(out_grad.size(), param.output_mean_var ? 
3U : 1U); CHECK_EQ(in_data.size(), 3U); CHECK_EQ(out_data.size(), 3U); @@ -226,8 +223,10 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, auto data_mem = data.GetMKLDNNData(); auto diff_mem = diff.GetMKLDNNData(); - auto fwd_pd = _GetFwd(data, ctx.is_train, param.eps, flags); - auto bwd_pd = _GetBwd(data, diff, param.eps, flags); + if (diff_mem->get_primitive_desc() != data_mem->get_primitive_desc()) { + data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc()); + } + auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags); auto gradi_mem = const_cast(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc()); if (flags & use_scale_shift) { @@ -238,6 +237,7 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, new mkldnn::memory(bwd_pd.weights_primitive_desc())); DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle()); + // TODO does batch norm only work on 4D array? nnvm::dim_t channels_ = data.shape()[1]; for (int i = 0; i < channels_; i++) { if (!param.fix_gamma) @@ -254,18 +254,21 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, new mkldnn::memory(bwd_pd.diff_weights_primitive_desc())); // training but no input mean and variance if (ctx.is_train && !param.use_global_stats) { - // std::cout << "bn backward here .." << std::endl; DType* moving_mean_ptr = reinterpret_cast(moving_mean.data().dptr()); DType* moving_var_ptr = reinterpret_cast(moving_var.data().dptr()); DType* out_mean_ptr = reinterpret_cast(out_mean.data().dptr()); DType* out_var_ptr = reinterpret_cast(out_var.data().dptr()); + mkldnn::memory var_mem(bwd_pd.variance_primitive_desc()); + DType *tmp_var_ptr = reinterpret_cast(var_mem.get_data_handle()); DType minus_mom = (1.0f - param.momentum); for (int i = 0; i < channels_; i++) { moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + out_mean_ptr[i] * minus_mom; + float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps); + tmp_var_ptr[i] = variance; moving_var_ptr[i] = moving_var_ptr[i] * param.momentum - + out_var_ptr[i] * minus_mom; + + variance * minus_mom; } std::shared_ptr out_mean_mem( @@ -276,7 +279,7 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, *data_mem, mkldnn::primitive::at(*out_mean_mem), - mkldnn::primitive::at(*out_var_mem), + mkldnn::primitive::at(var_mem), *diff_mem, *weight_mem, *gradi_mem, @@ -316,10 +319,8 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, for (int i = 0; i < channels_; i++) { (in_grad[2].data().dptr())[i] = gw_buf[i + channels_]; } - return; } else { LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ..."; - return; } } } // namespace op From 4c3aeb5b316e6c3ea9c08d15fa2a43a9c66a0e9b Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 17:46:32 +0000 Subject: [PATCH 209/264] Avoid memory allocation in MKLDNNCopy. 
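MKLDNNCopy now initializes the thread-local TmpMemMgr from the operator's
requested temp space before touching any MKLDNN memory, and _backward_copy
declares a kTempSpace resource so that workspace actually exists. Intermediate
buffers then come out of a pre-allocated arena instead of fresh allocations.
A minimal sketch of that workspace pattern, with a hypothetical ScratchArena
standing in for the real TmpMemMgr:

#include <cstddef>

// Hypothetical stand-in for TmpMemMgr: a bump allocator over the workspace
// that the executor hands to an operator via ctx.requested.
class ScratchArena {
 public:
  void Init(void *base, size_t bytes) {
    base_ = static_cast<char *>(base);
    size_ = bytes;
    used_ = 0;
  }
  // Hands out a chunk of the workspace; a real implementation would fall
  // back to the system allocator once the workspace is exhausted.
  void *Alloc(size_t bytes) {
    if (used_ + bytes > size_) return nullptr;
    void *p = base_ + used_;
    used_ += bytes;
    return p;
  }
 private:
  char *base_ = nullptr;
  size_t size_ = 0;
  size_t used_ = 0;
};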
--- src/operator/nn/mkldnn/mkldnn_copy.cc | 1 + src/operator/tensor/elemwise_unary_op_basic.cc | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc index 19350ed69290..71d540c969cd 100644 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ b/src/operator/nn/mkldnn/mkldnn_copy.cc @@ -34,6 +34,7 @@ namespace op { void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const NDArray &in_data, const OpReqType &req, const NDArray &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[0]); auto in_mem = in_data.GetMKLDNNData(); if (req == kAddTo) { TmpMemMgr::Get()->Init(ctx.requested[0]); diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 3590c088b66d..bcbe55c9406c 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -183,6 +183,11 @@ NNVM_REGISTER_OP(_backward_copy) .set_attr("FInferStorageType", CopyStorageType) .set_attr("FCompute", UnaryOp::IdentityCompute) .set_attr("FComputeEx", CopyEx) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; From 6507e822a839e22348839a4396a43aa1f5cfaf22 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 18:42:35 +0000 Subject: [PATCH 210/264] only use MKLDNN BatchNorm for special cases. MKLDNN BatchNorm doesn't work well on the default layout. --- src/operator/nn/batch_norm.cc | 50 +++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index ccea73c19dfc..7c8d98b26014 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -399,6 +399,21 @@ static inline bool similar_array(const mxnet::NDArray &arr1, return true; } +static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { + TShape shape = input.shape(); + bool support = input.storage_type() == kMKLDNNStorage && shape.ndim() == 4 + && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS + && shape[param.axis] % 8 == 0; + if (support) { + // We need to test its data layout. MKLDNN batchnorm doesn't work well on + // the default layout. + auto mem = input.GetMKLDNNData(); + auto desc = mem->get_primitive_desc().desc(); + support = desc.data.format != GetDefaultFormat(desc); + } + return support; +} + void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -406,8 +421,8 @@ void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, const std::vector &outputs) { CHECK_EQ(inputs.size(), 5U); #if MXNET_USE_MKLDNN == 1 - if (SupportMKLDNN(inputs[0])) { - const BatchNormParam ¶m = nnvm::get(attrs.parsed); + const BatchNormParam ¶m = nnvm::get(attrs.parsed); + if (SupportMKLDNNBN(inputs[0], param)) { std::vector in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); @@ -436,14 +451,16 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, const std::vector &req, const std::vector &outputs) { CHECK_EQ(inputs.size(), 11U); -#if MXNET_USE_MKLDNN == 1 - if (SupportMKLDNN(inputs[0])) { - const BatchNormParam ¶m = nnvm::get(attrs.parsed); - int num_out_grads = param.output_mean_var ? 
3U : 1U; - int in_data_start = 3; - int aux_states_start = in_data_start + batchnorm::kInMovingMean; - int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; + const BatchNormParam ¶m = nnvm::get(attrs.parsed); + int num_out_grads = param.output_mean_var ? 3U : 1U; + int in_data_start = 3; + int aux_states_start = in_data_start + batchnorm::kInMovingMean; + int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; + TShape shape = inputs[0].shape(); +#if MXNET_USE_MKLDNN == 1 + if (SupportMKLDNNBN(inputs[0], param) + && inputs[in_data_start].storage_type() == kMKLDNNStorage) { std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); std::vector in_data(inputs.begin() + in_data_start, inputs.begin() + aux_states_start); @@ -452,12 +469,11 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, std::vector out_data(inputs.begin() + out_data_start, inputs.end()); std::vector in_grad(outputs.begin(), outputs.begin() + 3); - switch (inputs[0].dtype()) { - case mshadow::kFloat32: - MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, - out_data, req, in_grad, aux_states); - return; - } + if (inputs[0].dtype() == mshadow::kFloat32) { + MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); + return; + } } #endif // cast NDArray to TBlob, and call original implementation. @@ -481,7 +497,7 @@ static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(in_attrs->size(), 5); CHECK_EQ(out_attrs->size(), 3); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (int& v : *in_attrs) { if (v == - 1) v = kDefaultStorage; @@ -507,7 +523,7 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(in_attrs->size(), 11); CHECK_EQ(out_attrs->size(), 5); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (int& v : *in_attrs) { if (v == - 1) v = kDefaultStorage; From d47b7440b42bdfb5130c2dfd5925475b84f85291 Mon Sep 17 00:00:00 2001 From: pengzhao-intel Date: Tue, 19 Dec 2017 15:57:48 +0800 Subject: [PATCH 211/264] Add MKL-DNN based LRN --- src/operator/nn/lrn.cc | 106 +++++++++++++++++- src/operator/nn/mkldnn/mkldnn_lrn-inl.h | 137 ++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 2 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_lrn-inl.h diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 53769c1c4c7d..51a69d04b15e 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -25,6 +25,12 @@ */ #include "./lrn-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn/cudnn_lrn-inl.h" +#endif +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_lrn-inl.h" +#endif namespace mxnet { namespace op { @@ -68,7 +74,7 @@ static bool LRNType(const nnvm::NodeAttrs& attrs, struct LRNGrad { const char *op_name; std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { + const std::vector& ograds) const { std::vector heads; heads.push_back(ograds[0]); // out_grad heads.push_back(n->inputs[lrn_enum::kData]); @@ -77,6 +83,98 @@ struct LRNGrad { } }; +inline static bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + 
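// Storage-dispatch sketch: on CPU every output is advertised as MKLDNN
+ // storage and execution is routed to FComputeEx; any other device keeps
+ // default storage and the plain FCompute path (see the branches below).
+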
CHECK(!in_attrs->empty()); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kMKLDNNStorage; + return true; + } +#endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) + (*out_attrs)[i] = kDefaultStorage; + return true; +} + +void LRNCompute_CPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { +#if MXNET_USE_MKLDNN == 1 + const LRNParam ¶m = nnvm::get(attrs.parsed); + if (SupportMKLDNN(inputs[0])) { + MKLDNNLRN_Forward(ctx, param, inputs[0], req[0], outputs[0]); + return; + } +#endif + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + LRNCompute(attrs, ctx, in_blobs, req, out_blobs); +} + +void LRNGradCompute_CPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { +#if MXNET_USE_MKLDNN == 1 + const LRNParam ¶m = nnvm::get(attrs.parsed); + const NDArray &out_grad = inputs[0]; + const NDArray &in_data = inputs[1]; + const NDArray &in_grad = outputs[0]; + + if (SupportMKLDNN(inputs[0])) { + MKLDNNLRN_Backward(ctx, param, out_grad, in_data, + req[0], in_grad); + return; + } +#endif + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) { + in_blobs[i] = inputs[i].data(); + } + + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + + LRNGradCompute(attrs, ctx, in_blobs, req, out_blobs); +} + DMLC_REGISTER_PARAMETER(LRNParam); NNVM_REGISTER_OP(LRN) @@ -103,7 +201,9 @@ number of kernels in the layer. .set_attr_parser(ParamParser) .set_attr("FInferShape", LRNShape) .set_attr("FInferType", LRNType) +.set_attr("FInferStorageType", LRNForwardInferStorageType) .set_attr("FCompute", LRNCompute) +.set_attr("FComputeEx", LRNCompute_CPU) .set_attr("FGradient", LRNGrad{"_backward_LRN"}) .add_argument("data", "NDArray-or-Symbol", "Input data to LRN") .add_arguments(LRNParam::__FIELDS__()); @@ -111,8 +211,10 @@ number of kernels in the layer. 
NNVM_REGISTER_OP(_backward_LRN) .set_num_outputs(1) .set_attr_parser(ParamParser) +.set_attr("FInferStorageType", LRNBackwardInferStorageType) .set_attr("TIsBackward", true) -.set_attr("FCompute", LRNGradCompute); +.set_attr("FCompute", LRNGradCompute) +.set_attr("FComputeEx", LRNGradCompute_CPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h new file mode 100644 index 000000000000..b7daa7b16ead --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_lrn-inl.h + * \brief + * \Author: Patric Zhao, patric.zhao@intel.com +*/ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include +#include "../lrn-inl.h" +#include "./mkldnn_base-inl.h" + +namespace mxnet { +namespace op { + +static inline algorithm GetMKLDNNLRNAlgo(const LRNParam ¶m) { + // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward + // Need to fix from MKLDNN + return algorithm::lrn_across_channels; +} + +inline static lrn_forward::primitive_desc GetLRNFwd( + const LRNParam ¶m, bool is_train, const memory::desc &src_md) { + auto engine = CpuEngine::Get()->get_engine(); + auto alg_ = GetMKLDNNLRNAlgo(param); + auto alpha_ = param.alpha; + auto beta_ = param.beta; + auto nsize_ = param.nsize; + auto k_ = param.knorm; + auto kind_ = prop_kind::forward_training; + if (is_train) { + kind_ = prop_kind::forward_training; + } else { + kind_ = prop_kind::forward_scoring; + } + lrn_forward::desc fwd_desc_(kind_, alg_, src_md, nsize_, alpha_, beta_, k_); + return mkldnn::lrn_forward::primitive_desc(fwd_desc_, engine); +} + +inline static mkldnn::lrn_backward::primitive_desc GetLRNBwd( + const LRNParam ¶m, const mkldnn::memory::desc &diff_in_md, + const mkldnn::memory::desc &diff_md, + const lrn_forward::primitive_desc &lrnFwd_desc) { + auto engine = CpuEngine::Get()->get_engine(); + auto alg_ = GetMKLDNNLRNAlgo(param); + auto alpha_ = param.alpha; + auto beta_ = param.beta; + int nsize_ = param.nsize; + auto k_ = param.knorm; + + lrn_backward::desc lrnBwd_desc(alg_, diff_in_md, + diff_md, nsize_, alpha_, beta_, k_); + return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc, + engine, lrnFwd_desc); +} + +void MKLDNNLRN_Forward(const OpContext &ctx, const LRNParam ¶m, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { + auto src_mem = in_data.GetMKLDNNData(); + auto src_md = src_mem->get_primitive_desc().desc(); + auto pdesc = GetLRNFwd(param, ctx.is_train, src_md); + auto dst_mem = const_cast(out_data).CreateMKLDNNData( + pdesc.dst_primitive_desc()); + if (ctx.is_train) { + 
std::shared_ptr ws_mem( + new mkldnn::memory(pdesc.workspace_primitive_desc())); + MKLDNNStream::Get()->RegisterPrim( + lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), + *ws_mem, *dst_mem)); + MKLDNNStream::Get()->Submit(); + } else { + MKLDNNStream::Get()->RegisterPrim( + lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), *dst_mem)); + MKLDNNStream::Get()->Submit(); + } +} + +void MKLDNNLRN_Backward(const OpContext &ctx, const LRNParam ¶m, + const NDArray &out_grad, + const NDArray &in_data, + const OpReqType &req, + const NDArray &in_grad) { + if (req == kNullOp) { + return; + } + // Repeat FW for getting workspace + auto data_mem = in_data.GetMKLDNNData(); + auto data_md = data_mem->get_primitive_desc().desc(); + auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md); + + // workspace to share + std::shared_ptr ws_mem( + new mkldnn::memory(pdesc_fwd.workspace_primitive_desc())); + std::shared_ptr dst_temp( + new mkldnn::memory(pdesc_fwd.dst_primitive_desc())); + MKLDNNStream::Get()->RegisterPrim( + lrn_forward(pdesc_fwd, mkldnn::primitive::at(*data_mem), + *ws_mem, *dst_temp)); + + auto data_in_md = pdesc_fwd.src_primitive_desc().desc(); + auto diff_mem = out_grad.GetMKLDNNData(); + auto diff_md = diff_mem->get_primitive_desc().desc(); + auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd); + + auto diff_src_mem = CreateMKLDNNMem(in_grad, + pdesc_bwd.diff_src_primitive_desc(), req); + + MKLDNNStream::Get()->RegisterPrim( + lrn_backward(pdesc_bwd, mkldnn::primitive::at(*data_mem), + mkldnn::primitive::at(*diff_mem), *ws_mem, *diff_src_mem.second)); + MKLDNNStream::Get()->Submit(); +} +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ From e7ad056b364a568269361d11c329a8672d304504 Mon Sep 17 00:00:00 2001 From: pengzhao-intel Date: Tue, 19 Dec 2017 16:37:48 +0800 Subject: [PATCH 212/264] Code Style Changes --- src/operator/nn/lrn.cc | 48 ++++++++++--------------- src/operator/nn/mkldnn/mkldnn_lrn-inl.h | 14 ++++---- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 51a69d04b15e..42b42c7bf4e2 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -25,9 +25,6 @@ */ #include "./lrn-inl.h" -#if MXNET_USE_CUDNN == 1 -#include "./cudnn/cudnn_lrn-inl.h" -#endif #if MXNET_USE_MKLDNN == 1 #include "./mkldnn/mkldnn_lrn-inl.h" #endif @@ -84,13 +81,12 @@ struct LRNGrad { }; inline static bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK(!in_attrs->empty()); - + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { #if MXNET_USE_MKLDNN == 1 + CHECK(!in_attrs->empty()); if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) @@ -98,19 +94,15 @@ inline static bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, return true; } #endif - *dispatch_mode = DispatchMode::kFCompute; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kDefaultStorage; - return true; } inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK(!in_attrs->empty()); + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + 
std::vector *out_attrs) { #if MXNET_USE_MKLDNN == 1 + CHECK(!in_attrs->empty()); if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; for (size_t i = 0; i < out_attrs->size(); i++) @@ -118,17 +110,13 @@ inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, return true; } #endif - *dispatch_mode = DispatchMode::kFCompute; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kDefaultStorage; - return true; } void LRNCompute_CPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 const LRNParam ¶m = nnvm::get(attrs.parsed); if (SupportMKLDNN(inputs[0])) { @@ -147,10 +135,10 @@ void LRNCompute_CPU(const nnvm::NodeAttrs &attrs, } void LRNGradCompute_CPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 const LRNParam ¶m = nnvm::get(attrs.parsed); const NDArray &out_grad = inputs[0]; diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h index b7daa7b16ead..e0ecc1873d96 100644 --- a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h @@ -35,7 +35,7 @@ namespace op { static inline algorithm GetMKLDNNLRNAlgo(const LRNParam ¶m) { // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward - // Need to fix from MKLDNN + // Need to confirm with MKLDNN team and fix later return algorithm::lrn_across_channels; } @@ -75,8 +75,8 @@ inline static mkldnn::lrn_backward::primitive_desc GetLRNBwd( } void MKLDNNLRN_Forward(const OpContext &ctx, const LRNParam ¶m, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { auto src_mem = in_data.GetMKLDNNData(); auto src_md = src_mem->get_primitive_desc().desc(); auto pdesc = GetLRNFwd(param, ctx.is_train, src_md); @@ -87,7 +87,7 @@ void MKLDNNLRN_Forward(const OpContext &ctx, const LRNParam ¶m, new mkldnn::memory(pdesc.workspace_primitive_desc())); MKLDNNStream::Get()->RegisterPrim( lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), - *ws_mem, *dst_mem)); + *ws_mem, *dst_mem)); MKLDNNStream::Get()->Submit(); } else { MKLDNNStream::Get()->RegisterPrim( @@ -109,7 +109,10 @@ void MKLDNNLRN_Backward(const OpContext &ctx, const LRNParam ¶m, auto data_md = data_mem->get_primitive_desc().desc(); auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md); - // workspace to share + // TODO(Patric): To keep the function stateless, we can't pass workspace + // from LRN forward to backward. We have to re-compute + // LRN forward to get the workspace. + // Will refine this code later. 
std::shared_ptr ws_mem( new mkldnn::memory(pdesc_fwd.workspace_primitive_desc())); std::shared_ptr dst_temp( @@ -122,7 +125,6 @@ void MKLDNNLRN_Backward(const OpContext &ctx, const LRNParam ¶m, auto diff_mem = out_grad.GetMKLDNNData(); auto diff_md = diff_mem->get_primitive_desc().desc(); auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd); - auto diff_src_mem = CreateMKLDNNMem(in_grad, pdesc_bwd.diff_src_primitive_desc(), req); From 530e2b28ba45f9ad1065eea202bbc2cb68636e84 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 20:40:40 +0000 Subject: [PATCH 213/264] Fix a bug in BN. --- src/operator/nn/batch_norm.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 7c8d98b26014..d5295243134a 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -500,7 +500,7 @@ static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (int& v : *in_attrs) { - if (v == - 1) v = kDefaultStorage; + if (v == kUndefinedStorage) v = kDefaultStorage; } (*out_attrs)[0] = kMKLDNNStorage; (*out_attrs)[1] = kDefaultStorage; @@ -508,7 +508,10 @@ static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, return true; } #endif - *dispatch_mode = DispatchMode::kFComputeEx; + *dispatch_mode = DispatchMode::kFCompute; + for (int& v : *in_attrs) { + if (v == - 1) v = kDefaultStorage; + } for (size_t i = 0; i < out_attrs->size(); i++) { (*out_attrs)[i] = kDefaultStorage; } @@ -526,7 +529,7 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) { *dispatch_mode = DispatchMode::kFComputeEx; for (int& v : *in_attrs) { - if (v == - 1) v = kDefaultStorage; + if (v == kUndefinedStorage) v = kDefaultStorage; } (*out_attrs)[0] = kMKLDNNStorage; (*out_attrs)[1] = kDefaultStorage; @@ -536,7 +539,10 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, return true; } #endif - *dispatch_mode = DispatchMode::kFComputeEx; + *dispatch_mode = DispatchMode::kFCompute; + for (int& v : *in_attrs) { + if (v == - 1) v = kDefaultStorage; + } for (size_t i = 0; i < out_attrs->size(); i++) { (*out_attrs)[i] = kDefaultStorage; } From 72f27d595e91dfa0c7364684aedba77ffad86640 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 20:40:58 +0000 Subject: [PATCH 214/264] Fix a bug in LRN. 
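The style pass in the previous LRN patch dropped the non-MKLDNN fallback from
both storage-type inference functions, so whenever the MKLDNN branch was not
taken they fell off the end without setting *dispatch_mode or returning a
value, which is undefined behavior for a non-void function. This patch
restores the fallback. A minimal sketch of the restored pattern (type names
follow the surrounding code; the helper itself is illustrative, not part of
the patch):

    // Fallback when no MKLDNN path applies: dispatch to the plain FCompute
    // kernel and mark every output as default (dense) storage.
    static bool FallBackToDefaultStorage(DispatchMode *dispatch_mode,
                                         std::vector<int> *out_attrs) {
      *dispatch_mode = DispatchMode::kFCompute;
      for (size_t i = 0; i < out_attrs->size(); i++)
        (*out_attrs)[i] = kDefaultStorage;
      return true;  // storage inference always succeeds on this path
    }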
--- src/operator/nn/lrn.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 42b42c7bf4e2..7073054532e8 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -94,6 +94,11 @@ inline static bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, return true; } #endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) { + (*out_attrs)[i] = kDefaultStorage; + } + return true; } inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, @@ -110,6 +115,11 @@ inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, return true; } #endif + *dispatch_mode = DispatchMode::kFCompute; + for (size_t i = 0; i < out_attrs->size(); i++) { + (*out_attrs)[i] = kDefaultStorage; + } + return true; } void LRNCompute_CPU(const nnvm::NodeAttrs &attrs, From 9321cf328e066a3a95e31179b93034f12fce1b94 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 23:18:22 +0000 Subject: [PATCH 215/264] Handle non-default storage in memory plan. --- src/imperative/cached_op.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index eaa95a5f2418..93a8bc6c54b2 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -214,6 +214,12 @@ nnvm::Graph Imperative::CachedOp::GetForwardGraph( StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; + const auto& stypes = g.GetAttr("storage_type"); + CHECK_EQ(stypes.size(), storage.size()); + for (size_t i = 0; i < stypes.size(); i++) { + if (stypes[i] != kDefaultStorage) + storage[i] = exec::kDynamicStorageID; + } auto mem_plan = PlanMemory( &g, std::move(storage), g.GetAttr >( @@ -320,6 +326,10 @@ nnvm::Graph Imperative::CachedOp::GetBackwardGraph( for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID; for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID; + for (size_t i = 0; i < stypes.size(); i++) { + if (stypes[i] != kDefaultStorage) + storage[i] = exec::kDynamicStorageID; + } auto mem_plan = PlanMemory( &g, std::move(storage), g.GetAttr >("backward_ref_count"), From ea30336f948e3017b4f4006f8a9792a12ed6fc91 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 23:22:16 +0000 Subject: [PATCH 216/264] Fix coding style. 
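This is a style-only change to mkldnn_batch_norm-inl.h: the if/else around
the gamma copy now braces both branches, as the linter used in this tree
requires, and a stale TODO is removed (the "4D only?" question it asked is
already settled by SupportMKLDNNBN, which only accepts 4D inputs). The rule
being applied, sketched with stand-in names:

    // Lint rule: if any branch of an if/else needs braces, brace all of them.
    if (!param.fix_gamma) {
      weight_buf[i] = gamma_ptr[i];  // copy the learned gamma
    } else {
      weight_buf[i] = DType(1.0f);   // fix_gamma: gamma pinned to 1
    }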
--- src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index 64f79abf7f57..145619b4ea65 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -125,9 +125,9 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, nnvm::dim_t channels_ = data.shape()[1]; for (int i = 0; i < channels_; i++) { - if (!param.fix_gamma) + if (!param.fix_gamma) { weight_buf[i] = (gamma.data().dptr())[i]; // weight - else { + } else { weight_buf[i] = (DType)1.0f; if (IsBNWriting(req[batchnorm::kGamma])) (gamma.data().dptr())[i] = (DType)1.0f; @@ -237,7 +237,6 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, new mkldnn::memory(bwd_pd.weights_primitive_desc())); DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle()); - // TODO does batch norm only work on 4D array? nnvm::dim_t channels_ = data.shape()[1]; for (int i = 0; i < channels_; i++) { if (!param.fix_gamma) From 7f741399d13db7c2ce3bce0a98201a5163e55d4f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 19 Dec 2017 23:37:50 +0000 Subject: [PATCH 217/264] Fix a compilation error without mkldnn. --- src/operator/nn/batch_norm.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index d5295243134a..6bfae6fdbbde 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -399,6 +399,7 @@ static inline bool similar_array(const mxnet::NDArray &arr1, return true; } +#if MXNET_USE_MKLDNN == 1 static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { TShape shape = input.shape(); bool support = input.storage_type() == kMKLDNNStorage && shape.ndim() == 4 @@ -413,6 +414,7 @@ static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &p } return support; } +#endif void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, From 03fd841a953e515f78595fd87e39d2ecdf2fa014 Mon Sep 17 00:00:00 2001 From: Lv Tao Date: Wed, 20 Dec 2017 10:33:08 +0800 Subject: [PATCH 218/264] Fix some coding styles for batch norm --- src/operator/nn/batch_norm.cc | 6 +- .../nn/mkldnn/mkldnn_batch_norm-inl.h | 62 ++++++++++--------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 6bfae6fdbbde..00857e8cbb00 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -430,7 +430,7 @@ void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, switch (inputs[0].dtype()) { case mshadow::kFloat32: - MKLDNNBatchNorm_Forward(ctx, param, in_data, req, outputs, aux_states); + MKLDNNBatchNormForward(ctx, param, in_data, req, outputs, aux_states); return; } } @@ -472,8 +472,8 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, std::vector in_grad(outputs.begin(), outputs.begin() + 3); if (inputs[0].dtype() == mshadow::kFloat32) { - MKLDNNBatchNorm_Backward(ctx, param, out_grad, in_data, - out_data, req, in_grad, aux_states); + MKLDNNBatchNormBackward(ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); return; } } diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index 145619b4ea65..035092780eb5 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ 
b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -49,8 +49,8 @@ using mkldnn::forward_training; using mkldnn::forward_inference; inline static unsigned _GetFlags(const std::vector &in_data, - const std::vector &aux_states, - const BatchNormParam ¶m, bool is_train) { + const std::vector &aux_states, + const BatchNormParam ¶m, bool is_train) { unsigned flags = 0U; if (in_data.size() == 3U) { flags |= use_scale_shift; @@ -65,8 +65,10 @@ inline static unsigned _GetFlags(const std::vector &in_data, } template -inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem, bool is_train, - DType eps, unsigned flags) { +inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem, + bool is_train, + DType eps, + unsigned flags) { auto data_mpd = data_mem.get_primitive_desc(); auto data_md = data_mpd.desc(); auto engine = CpuEngine::Get()->get_engine(); @@ -81,8 +83,10 @@ inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem, bool is_train } template -inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, const mkldnn::memory &diff_mem, - DType eps, unsigned flags) { +inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, + const mkldnn::memory &diff_mem, + DType eps, + unsigned flags) { auto data_mpd = data_mem.get_primitive_desc(); auto data_md = data_mpd.desc(); auto diff_mpd = diff_mem.get_primitive_desc(); @@ -94,11 +98,11 @@ inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, const mkldnn: } template -void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); const NDArray &data = in_data[batchnorm::kData]; @@ -194,13 +198,13 @@ void MKLDNNBatchNorm_Forward(const OpContext &ctx, const BatchNormParam ¶m, } template -void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); CHECK_EQ(out_grad.size(), param.output_mean_var ? 
3U : 1U); CHECK_EQ(in_data.size(), 3U); @@ -262,12 +266,12 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, DType minus_mom = (1.0f - param.momentum); for (int i = 0; i < channels_; i++) { - moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum - + out_mean_ptr[i] * minus_mom; + moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + + out_mean_ptr[i] * minus_mom; float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps); tmp_var_ptr[i] = variance; - moving_var_ptr[i] = moving_var_ptr[i] * param.momentum - + variance * minus_mom; + moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + + variance * minus_mom; } std::shared_ptr out_mean_mem( @@ -276,13 +280,13 @@ void MKLDNNBatchNorm_Backward(const OpContext &ctx, const BatchNormParam ¶m, new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr)); auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, - *data_mem, - mkldnn::primitive::at(*out_mean_mem), - mkldnn::primitive::at(var_mem), - *diff_mem, - *weight_mem, - *gradi_mem, - *gradw_mem); + *data_mem, + mkldnn::primitive::at(*out_mean_mem), + mkldnn::primitive::at(var_mem), + *diff_mem, + *weight_mem, + *gradi_mem, + *gradw_mem); MKLDNNStream::Get()->RegisterPrim(bn_bwd); MKLDNNStream::Get()->Submit(); From 85e020f38ebf934163470ce42e501f9bdca0beb3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Dec 2017 02:10:08 +0000 Subject: [PATCH 219/264] a hack to speed up MKLDNN inference. --- include/mxnet/ndarray.h | 12 +++++++++++ src/ndarray/ndarray.cc | 5 +++++ src/operator/nn/mkldnn/mkldnn_base-inl.h | 2 +- src/operator/nn/mkldnn/mkldnn_base.cc | 20 ++++++++++++++----- src/operator/nn/mkldnn/mkldnn_convolution.cc | 2 +- .../nn/mkldnn/mkldnn_deconvolution.cc | 2 +- 6 files changed, 35 insertions(+), 8 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 93c47788e92d..e4c832a36afd 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -552,6 +552,13 @@ class NDArray { #if MXNET_USE_MKLDNN == 1 bool IsMKLDNNDefault() const; + void SaveMKLDNNReorder(std::shared_ptr reorder) const; + const mkldnn::memory *GetMKLDNNReorder() const { + if (ptr_ != nullptr) + return ptr_->Mkl_reorder_.get(); + else + return nullptr; + } /* * All functions below return a raw pointer to mkldnn memory. Actually there * is a shared pointer that hold the memory either in NDArray or in MKLDNN @@ -627,6 +634,11 @@ class NDArray { /*! This is created when data is stored in MKLDNN format. */ std::shared_ptr Mkl_mem_; + /* + * This contains a copy of the original data. However, the data in this + * member may be out of date. TODO(zhengda) we should fix this problem. + */ + std::shared_ptr Mkl_reorder_; #endif /*! \brief variable from engine */ Engine::VarHandle var; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 2edc6d041aa0..e7232dc438a6 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -418,6 +418,11 @@ bool NDArray::IsMKLDNNDefault() const { } } +void NDArray::SaveMKLDNNReorder(std::shared_ptr reorder) const { + if (ptr_) + ptr_->Mkl_reorder_ = reorder; +} + void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. 
// For example, if the array stores parameters, the MKL memory may store data diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 5b3842604ed9..230473152db1 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -285,7 +285,7 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, - int num_groups); + int num_groups, bool save_reorder = false); const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::engine &engine, int num_groups = 1); diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 1cf538f5a86e..723c359b650d 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -80,8 +80,11 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, - int num_groups) { - const mkldnn::memory *mem; + int num_groups, bool save_reorder) { + const mkldnn::memory *mem = arr.GetMKLDNNReorder(); + if (mem != nullptr) + return mem; + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Get()->get_engine(); if (arr.shape().ndim() == 2) { @@ -118,9 +121,16 @@ const mkldnn::memory *GetWeights(const NDArray &arr, } if (mem->get_primitive_desc() == target_pd) return mem; - auto ret = TmpMemMgr::Get()->Alloc(target_pd); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); - return ret; + if (save_reorder) { + std::shared_ptr ret(new mkldnn::memory(target_pd)); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); + arr.SaveMKLDNNReorder(ret); + return ret.get(); + } else { + auto ret = TmpMemMgr::Get()->Alloc(target_pd); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; + } } const mkldnn::memory *GetWeights(const NDArray &arr, diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 0c39d81deacf..b7bbf261dcd2 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -177,7 +177,7 @@ void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd_pd.src_primitive_desc()); auto engine = CpuEngine::Get()->get_engine(); auto weight_mem = GetWeights(in_data[conv::kWeight], - fwd_pd.weights_primitive_desc(), param.num_group); + fwd_pd.weights_primitive_desc(), param.num_group, !ctx.is_train); auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd_pd.dst_primitive_desc(), req[conv::kOut]); diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index eda28e3d8cff..6435fa868bce 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -164,7 +164,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &c auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( deconvFwd_pd.diff_dst_primitive_desc()); auto weight_mem = GetWeights(in_data[deconv::kWeight], - deconvFwd_pd.weights_primitive_desc(), param.num_group); + deconvFwd_pd.weights_primitive_desc(), param.num_group, !ctx.is_train); auto out_mem = 
CreateMKLDNNMem(out_data[deconv::kOut], deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); From 7f643acc3175372b37db0d43bab928ad88bd7d6d Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Dec 2017 07:49:09 +0000 Subject: [PATCH 220/264] Improve forward of convolution. --- src/operator/nn/mkldnn/mkldnn_convolution.cc | 111 ++++++++++++++++--- 1 file changed, 93 insertions(+), 18 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index b7bbf261dcd2..153606988594 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -31,7 +31,7 @@ namespace mxnet { namespace op { -static mkldnn::convolution_forward::primitive_desc GetConvFwd( +static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl( const ConvolutionParam& param, bool is_train, const NDArray &data, const NDArray &weights, const NDArray *bias, const NDArray &output) { auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; @@ -165,30 +165,106 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( } } +class MKLDNNConvForward { + std::shared_ptr fwd; + std::shared_ptr data; + std::shared_ptr weight; + std::shared_ptr bias; + std::shared_ptr out; + + public: + mkldnn::convolution_forward::primitive_desc fwd_pd; + + MKLDNNConvForward(const ConvolutionParam& param, bool is_train, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output): fwd_pd( + GetConvFwdImpl(param, is_train, data, weights, bias, output)) { + } + + void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight, + const mkldnn::memory *bias, const mkldnn::memory &output) { + if (this->data == nullptr) + this->data = std::shared_ptr(new mkldnn::memory( + fwd_pd.src_primitive_desc(), data.get_data_handle())); + else + this->data->set_data_handle(data.get_data_handle()); + + if (this->weight == nullptr) + this->weight = std::shared_ptr(new mkldnn::memory( + fwd_pd.weights_primitive_desc(), weight.get_data_handle())); + else + this->weight->set_data_handle(weight.get_data_handle()); + + if (this->out == nullptr) + this->out = std::shared_ptr(new mkldnn::memory( + fwd_pd.dst_primitive_desc(), output.get_data_handle())); + else + this->out->set_data_handle(output.get_data_handle()); + + if (bias != nullptr) { + if (this->bias == nullptr) + this->bias = std::shared_ptr(new mkldnn::memory( + fwd_pd.bias_primitive_desc(), bias->get_data_handle())); + else + this->bias->set_data_handle(bias->get_data_handle()); + if (this->fwd == nullptr) + this->fwd = std::shared_ptr( + new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data), + mkldnn::primitive::at(*this->weight), + mkldnn::primitive::at(*this->bias), + *this->out)); + } else if (this->fwd == nullptr) { + this->fwd = std::shared_ptr( + new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data), + mkldnn::primitive::at(*this->weight), + *this->out)); + } + } + + const mkldnn::convolution_forward &GetFwd() const { + return *fwd; + } +}; + +static inline MKLDNNConvForward &GetConvFwd( + const nnvm::NodeAttrs& attrs, bool is_train, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { + static thread_local std::unordered_map fwds; + const ConvolutionParam& param = nnvm::get(attrs.parsed); + intptr_t key = reinterpret_cast(&attrs.parsed); + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNConvForward 
fwd(param, is_train, data, weights, bias, output); + auto ins_ret = fwds.insert( + std::pair(key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); const ConvolutionParam& param = nnvm::get(attrs.parsed); - mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, + MKLDNNConvForward &fwd = GetConvFwd(attrs, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); - auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd_pd.src_primitive_desc()); - auto engine = CpuEngine::Get()->get_engine(); - auto weight_mem = GetWeights(in_data[conv::kWeight], - fwd_pd.weights_primitive_desc(), param.num_group, !ctx.is_train); - auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], - fwd_pd.dst_primitive_desc(), req[conv::kOut]); - if (param.no_bias) { - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(fwd_pd, - *data_mem, *weight_mem, *out_mem.second)); - } else { - auto bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd_pd.bias_primitive_desc()); - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(fwd_pd, - *data_mem, *weight_mem, *bias_mem, *out_mem.second)); - } + auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc()); + auto weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), + param.num_group, !ctx.is_train); + auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(), + req[conv::kOut]); + const mkldnn::memory *bias_mem = nullptr; + if (!param.no_bias) + bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc()); + fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second); + MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); + CommitOutput(out_data[conv::kOut], out_mem); MKLDNNStream::Get()->Submit(); } @@ -199,9 +275,8 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct const std::vector& outputs) { TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); const std::vector &in_grad = outputs; - auto engine = CpuEngine::Get()->get_engine(); const ConvolutionParam& param = nnvm::get(attrs.parsed); - mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwd(param, ctx.is_train, + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwdImpl(param, ctx.is_train, inputs[conv::kData + 1], inputs[conv::kWeight + 1], param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); From 08a67edf6ebf051721d4842ac69140ac38268d6c Mon Sep 17 00:00:00 2001 From: Lv Tao Date: Thu, 21 Dec 2017 16:05:47 +0800 Subject: [PATCH 221/264] Add openmp and simd support to BN operator --- .../nn/mkldnn/mkldnn_batch_norm-inl.h | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index 035092780eb5..6332f0a90f99 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -128,18 +128,27 @@ void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle()); nnvm::dim_t channels_ = data.shape()[1]; - for (int i = 0; i < channels_; i++) { - if (!param.fix_gamma) { - weight_buf[i] = (gamma.data().dptr())[i]; // weight - } else { + DType* weight_ptr = gamma.data().dptr(); + DType* bias_ptr = beta.data().dptr(); + if (!param.fix_gamma) { +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + weight_buf[i] = weight_ptr[i]; + weight_buf[channels_ + i] = bias_ptr[i]; // bias + } + } else if (IsBNWriting(req[batchnorm::kGamma])) { +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { weight_buf[i] = (DType)1.0f; - if (IsBNWriting(req[batchnorm::kGamma])) - (gamma.data().dptr())[i] = (DType)1.0f; + weight_ptr[i] = (DType)1.0f; + weight_buf[channels_ + i] = bias_ptr[i]; // bias + } + } else { +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + weight_buf[i] = (DType)1.0f; + weight_buf[channels_ + i] = bias_ptr[i]; // bias } - } - - for (int i = 0; i < channels_; i++) { - weight_buf[channels_ + i] = (beta.data().dptr())[i]; // bias } if (!ctx.is_train) { @@ -148,11 +157,12 @@ void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, DType* inmean = aux_states[batchnorm::kMovingMean].data().dptr(); DType* invar = aux_states[batchnorm::kMovingVar].data().dptr(); // to align with origin implmentation: batch_norm.cc: L164 +#pragma omp parallel for simd for (int i = 0; i < channels_; i++) { - omean[i] = (aux_states[batchnorm::kMovingMean].data().dptr())[i]; - ovar[i] = VARIANCE_TO_INVSTD( - (aux_states[batchnorm::kMovingVar].data().dptr())[i], param.eps); + omean[i] = inmean[i]; + ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps); } + std::shared_ptr mean_m( new mkldnn::memory(fwd_pd.mean_primitive_desc(), inmean)); std::shared_ptr var_m( @@ -186,10 +196,12 @@ void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, *var_mem); MKLDNNStream::Get()->RegisterPrim(bn); MKLDNNStream::Get()->Submit(); + DType* mean_mem_ptr = reinterpret_cast(mean_mem->get_data_handle()); + DType* var_mem_ptr = reinterpret_cast(var_mem->get_data_handle()); +#pragma omp parallel for simd for (int i = 0; i < channels_; i++) { - omean[i] = (reinterpret_cast(mean_mem->get_data_handle()))[i]; - ovar[i] = VARIANCE_TO_INVSTD( - (reinterpret_cast(var_mem->get_data_handle()))[i], param.eps); + omean[i] = mean_mem_ptr[i]; + ovar[i] = VARIANCE_TO_INVSTD(var_mem_ptr[i], param.eps); } } } else { // no input gamma and beta From 313b80bb76cdaa88d5571f58dc57b7c343e4df3f Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Dec 2017 18:21:02 +0000 Subject: [PATCH 222/264] Retrieve MKLDNN Conv primitive based on signature. 
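Keying the forward-primitive cache on the address of attrs.parsed was a
stopgap: the address is neither stable nor unique across graphs. This patch
introduces MKLDNNOpSignature, a key that folds the operator parameters
together with the dtype and shape of every input/output array, so a cached
primitive is reused only when everything that determines the mkldnn
descriptor matches. Equality compares the full element list, so a hash
collision can never alias two different configurations. A standalone sketch
of the idea, with the running hash explicitly seeded (worth doing in the
real class as well, whose hash member starts uninitialized):

    #include <cstdint>
    #include <vector>

    // Order-sensitive signature: equal signatures hash equally, and
    // operator== re-checks the element list to rule out collisions.
    class OpSignature {
      std::vector<int> eles;
      uint64_t hash = 0;  // explicit seed; accumulation starts from zero
     public:
      void AddSign(int val) {
        hash = hash * 2 + val;  // cheap rolling hash, as in the patch
        eles.push_back(val);
      }
      bool operator==(const OpSignature &o) const {
        return hash == o.hash && eles == o.eles;
      }
      uint64_t GetHash() const { return hash; }
    };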
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 64 ++++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_convolution.cc | 15 ++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 230473152db1..10bb36170e41 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -271,6 +271,70 @@ class MKLDNNStream { } }; +class MKLDNNOpSignature { + std::vector eles; + uint64_t hash; + public: + void AddSign(const std::vector &arrs) { + for (auto &arr : arrs) { + AddSign(arr); + } + } + + void AddSign(const NDArray &arr) { + hash = hash * 2 + arr.dtype(); + eles.push_back(arr.dtype()); + AddSign(arr.shape()); + } + + void AddSign(const TShape &shape) { + for (size_t i = 0; i < shape.ndim(); i++) { + hash = hash * 2 + shape[i]; + eles.push_back(shape[i]); + } + } + + void AddSign(int val) { + hash = hash * 2 + val; + eles.push_back(val); + } + + bool operator==(const MKLDNNOpSignature &sign) const { + if (hash != sign.hash) + return false; + if (eles.size() != sign.eles.size()) + return false; + for (size_t i = 0; i < eles.size(); i++) + if (eles[i] != sign.eles[i]) + return false; + return true; + } + + uint64_t GetHash() const { + return hash; + } +}; + +struct MKLDNNOpHash { + size_t operator()(const MKLDNNOpSignature &sign) const { + return sign.GetHash(); + } +}; + +template +class MKLDNNParamOpSign: public MKLDNNOpSignature { + const ParamType param; + public: + MKLDNNParamOpSign(const ParamType &_param): param(_param) { + } + + bool operator==(const MKLDNNParamOpSign &sign) const { + const MKLDNNOpSignature &this_upper = *this; + const MKLDNNOpSignature &other_upper = sign; + return this_upper == other_upper && param == sign.param; + } +}; + enum OutDataOp { Noop, CopyBack, diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 153606988594..9ceeacddcaa9 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -226,18 +226,27 @@ class MKLDNNConvForward { } }; +typedef MKLDNNParamOpSign MKLDNNConvSignature; + static inline MKLDNNConvForward &GetConvFwd( const nnvm::NodeAttrs& attrs, bool is_train, const NDArray &data, const NDArray &weights, const NDArray *bias, const NDArray &output) { - static thread_local std::unordered_map fwds; + static thread_local std::unordered_map fwds; const ConvolutionParam& param = nnvm::get(attrs.parsed); - intptr_t key = reinterpret_cast(&attrs.parsed); + MKLDNNConvSignature key(param); + key.AddSign(is_train); + key.AddSign(data); + key.AddSign(weights); + key.AddSign(output); + if (bias) + key.AddSign(*bias); + auto it = fwds.find(key); if (it == fwds.end()) { MKLDNNConvForward fwd(param, is_train, data, weights, bias, output); auto ins_ret = fwds.insert( - std::pair(key, fwd)); + std::pair(key, fwd)); CHECK(ins_ret.second); it = ins_ret.first; } From 550e5e6c5abc1f0d207ec87766e35f11bbc98608 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 21 Dec 2017 18:59:02 +0000 Subject: [PATCH 223/264] Retrieve Act primitive based on its signature. 
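The same memoization scheme, applied to activation: ActivationParam gains
operator== so it can serve in a signature key, and the eltwise forward
primitive_desc is built once per (is_train, act_type, input dtype/shape)
combination, then fetched from a thread_local map. The cache is per thread
on purpose: it avoids locking on the hot path and never shares descriptors
across threads. A hedged sketch of the lookup-or-create idiom (a generic
helper for illustration; the patch inlines this logic in GetActForwardDesc):

    #include <unordered_map>

    // Find the cached value for key, or build it exactly once via make().
    template <typename Key, typename Val, typename Hash, typename Make>
    const Val &LookupOrCreate(std::unordered_map<Key, Val, Hash> *cache,
                              const Key &key, Make make) {
      auto it = cache->find(key);
      if (it == cache->end())
        it = cache->emplace(key, make()).first;  // create exactly once
      return it->second;
    }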
--- src/operator/nn/activation-inl.h | 4 +++ src/operator/nn/mkldnn/mkldnn_act-inl.h | 44 +++++++++++++++++++++---- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index a1e2423ac0df..8368c9898502 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -61,6 +61,10 @@ struct ActivationParam : public dmlc::Parameter { .add_enum("softrelu", activation::kSoftReLU) .describe("Activation function to be applied."); } + + bool operator==(const ActivationParam& other) const { + return this->act_type == other.act_type; + } }; template diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index 664a27c99560..7389de4a87c1 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -69,12 +69,13 @@ static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { } } +typedef std::shared_ptr mkldnn_act_pdesc_ptr; + template -void MKLDNNActivationForward(const OpContext &ctx, const ActivationParam& param, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { - auto input_mem = in_data.GetMKLDNNData(); - mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); +mkldnn_act_pdesc_ptr GetActForwardDescImpl(const ActivationParam& param, + const OpContext &ctx, + const mkldnn::memory &input_mem) { + mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc(); mkldnn::memory::desc data_md = data_mpd.desc(); auto cpu_engine = data_mpd.get_engine(); Dtype alpha = 0; @@ -85,8 +86,39 @@ void MKLDNNActivationForward(const OpContext &ctx, const ActivationParam& param, alg, data_md, alpha) : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, alg, data_md, alpha); - mkldnn::eltwise_forward::primitive_desc pdesc(desc, cpu_engine); + return mkldnn_act_pdesc_ptr(new mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine)); +} +typedef MKLDNNParamOpSign MKLDNNActSignature; + +template +const mkldnn::eltwise_forward::primitive_desc &GetActForwardDesc( + const ActivationParam& param, const OpContext &ctx, const NDArray &in_data, + const mkldnn::memory &input_mem) { + static thread_local std::unordered_map descs; + MKLDNNActSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(param.act_type); + key.AddSign(in_data); + + auto it = descs.find(key); + if (it == descs.end()) { + auto desc = GetActForwardDescImpl(param, ctx, input_mem); + auto ins_ret = descs.insert( + std::pair(key, desc)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return *it->second; +} + +template +void MKLDNNActivationForward(const OpContext &ctx, const ActivationParam& param, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { + auto input_mem = in_data.GetMKLDNNData(); + const mkldnn::eltwise_forward::primitive_desc &pdesc = GetActForwardDesc( + param, ctx, in_data, *input_mem); auto output_memory = const_cast(out_data).CreateMKLDNNData( pdesc.dst_primitive_desc()); MKLDNNStream *stream = MKLDNNStream::Get(); From 4122a7ac5594ea599a6e9d6c9af7497e4d2b8bbb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 22 Dec 2017 09:35:12 +0000 Subject: [PATCH 224/264] Fix a bug in pooling. 
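With MKLDNN enabled, Pooling registers two outputs (the pooled result plus a
hidden workspace) and masks the second one through FNumVisibleOutputs.
Without an explicit FListOutputNames, default name generation can still
enumerate both outputs, exposing a phantom output name downstream. The fix
pins the visible name list to the single real output. The attribute pair as
it now stands, condensed from the registration:

    .set_attr("FNumVisibleOutputs",
        [](const NodeAttrs& attrs) { return 1; })      // hide the workspace
    .set_attr("FListOutputNames",
        [](const NodeAttrs& attrs) {
          return std::vector<std::string>{"output"};   // one visible name
        })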
--- src/operator/nn/pooling.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 7a774f49de53..62717bf01747 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -397,6 +397,10 @@ height, width)*. .set_attr("FNumVisibleOutputs", [](const NodeAttrs& attrs) { return 1; }) #endif +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) .set_attr_parser(PoolingParamParser) .set_attr("FInferStorageType", PoolingStorageType) .set_attr("FInferType", PoolingType) From e2ab086d4aaef9f24b835263fafc79ed820341a2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 22 Dec 2017 09:35:23 +0000 Subject: [PATCH 225/264] Diable some MKLDNN activation and pooling. --- src/operator/nn/mkldnn/mkldnn_act-inl.h | 6 ++++-- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h index 7389de4a87c1..982bf6ec0507 100644 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -46,11 +46,13 @@ namespace mxnet { namespace op { static inline bool SupportMKLDNNAct(const ActivationParam& param) { - // We don't include tanh for now. It seems MKLDNN tanh has some precision + // We only enable ReLU for now. It seems other activations have some precision // problems. - return param.act_type == activation::kReLU + return param.act_type == activation::kReLU; +#if 0 || param.act_type == activation::kSigmoid || param.act_type == activation::kSoftReLU; +#endif } static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index 301517e1369e..6bae480b8c30 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -35,8 +35,11 @@ namespace op { static inline bool SupportMKLDNNPooling(const PoolingParam ¶m) { return param.kernel.ndim() == 2 - && (param.pool_type == pool_enum::kMaxPooling + && (param.pool_type == pool_enum::kMaxPooling); +#if 0 + // It seems average pooling has precision problems. || param.pool_type == pool_enum::kAvgPooling); +#endif } static inline bool SupportMKLDNNPooling(const PoolingParam ¶m, From 80c43bd98a5f73e16eede406611e4a31ae1bd069 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 22 Dec 2017 23:51:11 +0000 Subject: [PATCH 226/264] Cast MKLDNN storage with diff data type. 
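CastStorageMKLDnsImpl previously assumed the source and destination share a
dtype. Now, when the dtypes differ, the cast routes through ndarray::Copy,
which performs the elementwise type conversion; the mkldnn reorder is kept
for the same-dtype case only, since reorder as used here changes layout, not
element type. The decision tree, roughly (stand-in names, not the literal
code):

    if (src.dtype() != dst.dtype()) {
      // dtype conversion: fall back to the generic dense copy, which
      // casts element types while writing into the destination blob.
      ndarray::Copy(src.data(), &dst_blob, src.ctx(), dst.ctx(), run_ctx);
    } else if (src_format != default_format) {
      // same dtype, non-default mkldnn layout: one reorder suffices.
      net.push_back(mkldnn::reorder(src_mem, dst_mem));
      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
    } else {
      // same dtype, already in the default layout: a plain memcpy.
      memcpy(dst_ptr, src_ptr, src.shape().Size() * elem_size);
    }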
--- src/operator/tensor/cast_storage-inl.h | 5 ++-- src/operator/tensor/cast_storage.cc | 41 ++++++++++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index 41b4eaa1aeca..e113ee1befa4 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -325,7 +325,7 @@ void CastStorageCsrDnsImpl(const OpContext& ctx, } #if MXNET_USE_MKLDNN == 1 -void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns); +void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns); void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns); #endif @@ -349,8 +349,7 @@ void CastStorageComputeImpl(const OpContext& ctx, CastStorageCsrDnsImpl(ctx, input, &ret); #if MXNET_USE_MKLDNN == 1 } else if (src_stype == kMKLDNNStorage && dst_stype == kDefaultStorage) { - TBlob ret = output.data(); - CastStorageMKLDnsImpl(ctx, input, &ret); + CastStorageMKLDnsImpl(ctx, input, output); } else if (src_stype == kDefaultStorage && dst_stype == kMKLDNNStorage) { CastStorageDnsMKLImpl(ctx, input, output); #endif diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 2e12f561e697..58578ea1b4e8 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -37,24 +37,33 @@ static inline int get_type_size(int dtype) { return -1; } -void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, TBlob* dns) { +void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst_arr) { + TBlob dns = dst_arr.data(); CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), Context::kCPU); - CHECK(src.shape() == dns->shape_); - CHECK_EQ(src.dtype(), dns->type_flag_); - // This converts the source data to the default format and write the data to - // the destination directly. - std::vector net; - auto src_mkldnn = src.GetMKLDNNData(); - auto src_pd = src_mkldnn->get_primitive_desc(); - auto def_format = GetDefaultFormat(src_pd.desc()); - if (def_format != src_pd.desc().data.format) { - auto dst_pd = GetPrimitiveDesc(src_pd, def_format); - mkldnn::memory dst_mkldnn(dst_pd, dns->dptr_); - net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - } else { + CHECK(src.shape() == dns.shape_); + if (src.dtype() != dns.type_flag_) { + // If the input and output have different data types, we have to convert + // the source array into the default layout, cast the data type and copy + // data to the destination array. const TBlob &src_blob = src.data(); - memcpy(dns->dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns->type_flag_)); + CHECK(src.ctx() == dst_arr.ctx()); + ndarray::Copy(src.data(), &dns, src.ctx(), dst_arr.ctx(), ctx.run_ctx); + } else { + // This converts the source data to the default format and write the data to + // the destination directly. 
+ std::vector net; + auto src_mkldnn = src.GetMKLDNNData(); + auto src_pd = src_mkldnn->get_primitive_desc(); + auto def_format = GetDefaultFormat(src_pd.desc()); + if (def_format != src_pd.desc().data.format) { + auto dst_pd = GetPrimitiveDesc(src_pd, def_format); + mkldnn::memory dst_mkldnn(dst_pd, dns.dptr_); + net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + } else { + const TBlob &src_blob = src.data(); + memcpy(dns.dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns.type_flag_)); + } } } From 4fd62cc9795472ac588c79871feece8986087a35 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Dec 2017 00:48:29 +0000 Subject: [PATCH 227/264] Check if it's a view of NDArray. --- include/mxnet/ndarray.h | 6 ++++++ src/ndarray/ndarray.cc | 26 +++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index e4c832a36afd..88c6ba7b9417 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -139,6 +139,12 @@ class NDArray { dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) { } + inline bool is_view() const { + // Sparse arrays don't have a view. + if (storage_type() == kRowSparseStorage || storage_type() == kCSRStorage) + return false; + return byte_offset_ > 0 || shape() != ptr_->storage_shape; + } /*! * \return the shape of current NDArray. diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e7232dc438a6..231557077794 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -98,15 +98,16 @@ NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context c } } if (storage_shape.Size() == 0 -#if MXNET_USE_MKLDNN == 1 - && stype != kMKLDNNStorage -#endif && stype != kDefaultStorage) { if (stype == kRowSparseStorage) { storage_shape = shape; storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; } else if (stype == kCSRStorage) { storage_shape = aux_shapes[csr::kIdx]; +#if MXNET_USE_MKLDNN == 1 + } else if (stype == kMKLDNNStorage) { + storage_shape = shape; +#endif } else { LOG(FATAL) << "Unknown storage type " << stype; } @@ -193,8 +194,6 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { } else if (storage_type() == kMKLDNNStorage) { NDArray ret(kMKLDNNStorage, shape, ctx(), ptr_->delay_alloc, dtype()); CHECK(ptr_->Mkl_mem_ != nullptr); - // This doesn't work on sliced NDArray yet. - CHECK_EQ(byte_offset_, 0); // We shouldn't submit the reorder primitive here because submit will // be called in operators. auto format = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc()); @@ -213,6 +212,7 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { ret.ptr_->Mkl_mem_ = std::shared_ptr(def_mem, EmptyMKLDNNDeleter()); } + ret.byte_offset_ = byte_offset_; return ret; } LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet"; @@ -553,11 +553,12 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const { ptr_->SetMKLMem(shape_, dtype_); CHECK(ptr_->Mkl_mem_ != nullptr); MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); - if (byte_offset_ > 0) { - // Slice only works on the default layout and Slice() turns an array into - // the default layout. - auto pd = ptr_->Mkl_mem_->get_primitive_desc(); + auto pd = ptr_->Mkl_mem_->get_primitive_desc(); + if (is_view()) { + // Sliced array must use the default layout. 
CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format); + } + if (byte_offset_ > 0) { void *off_addr = static_cast(ptr_->Mkl_mem_->get_data_handle()) + byte_offset_; @@ -595,14 +596,16 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { return; } - // This doesn't work on sliced NDArray yet. - CHECK_EQ(byte_offset_, 0); MKLDNNStream *stream = MKLDNNStream::Get(); ptr_->SetMKLMem(shape_, dtype_); stream->RegisterMem(ptr_->Mkl_mem_); auto from_desc = mem.get_primitive_desc().desc(); auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); auto from_def_format = GetDefaultFormat(from_desc); + if (is_view()) { + // Sliced array must use the default layout. + CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format); + } // It's possible that the memory and the NDArray don't have the same shape. if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims) // If the source memory uses the default layout, we can reshape directly. @@ -713,6 +716,7 @@ void NDArray::SetTBlob() const { else ptr_->SetMKLMem(shape_, dtype_); dptr = static_cast(ptr_->Mkl_mem_->get_data_handle()); + dptr += byte_offset_; #endif } else { LOG(FATAL) << "unknown storage type " << stype; From e9e47c7f50b553dc7ee5eda8a9e240b3e41325a9 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Dec 2017 00:57:30 +0000 Subject: [PATCH 228/264] Reshaped and sliced arrays share the same chunks. --- src/ndarray/ndarray.cc | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 231557077794..96363eb7ba80 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -231,20 +231,17 @@ NDArray NDArray::Reshape(const TShape &shape) const { return ret; #if MXNET_USE_MKLDNN == 1 } else if (storage_type() == kMKLDNNStorage) { - NDArray ret(kMKLDNNStorage, shape, ctx(), ptr_->delay_alloc, dtype()); + NDArray ret = this->Detach(); + ret.shape_ = shape; // We need to convert the MKL memory to the default layout. Engine::Get()->PushSync([&](RunContext ctx) { if (this->ptr_->Mkl_mem_) { auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); - } else { - ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_; } - // We should make sure slice still works. 
- ret.byte_offset_ = this->byte_offset_; } - }, ctx(), {this->var()}, {ret.var()}, + }, ctx(), {}, {ret.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); ret.WaitToRead(); return ret; @@ -280,9 +277,8 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { #if MXNET_USE_MKLDNN == 1 CHECK(storage_type() == kDefaultStorage || storage_type() == kMKLDNNStorage); if (storage_type() == kMKLDNNStorage) { - TShape new_shape = shape_; - new_shape[0] = end - begin; - NDArray ret(kMKLDNNStorage, new_shape, ctx(), ptr_->delay_alloc, dtype()); + NDArray ret = this->Detach(); + ret.shape_[0] = end - begin; size_t length = shape_.ProdShape(1, shape_.ndim()); MSHADOW_TYPE_SWITCH(ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); @@ -293,10 +289,8 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); - } else { - ret.ptr_->Mkl_mem_ = this->ptr_->Mkl_mem_; } - }, ctx(), {this->var()}, {ret.var()}, + }, ctx(), {}, {ret.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); ret.WaitToRead(); return ret; From a1423efc475ab1d9d96dc8d4ab852bc465c98571 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 23 Dec 2017 06:50:05 +0000 Subject: [PATCH 229/264] Implement caching MKLDNN Act correctly. --- src/operator/nn/activation.cc | 8 +- src/operator/nn/mkldnn/mkldnn_act-inl.h | 173 --------------- src/operator/nn/mkldnn/mkldnn_act.cc | 213 +++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_base-inl.h | 26 +++ src/operator/nn/mkldnn/mkldnn_convolution.cc | 3 + src/operator/nn/mkldnn/mkldnn_ops-inl.h | 8 + 6 files changed, 254 insertions(+), 177 deletions(-) delete mode 100644 src/operator/nn/mkldnn/mkldnn_act-inl.h create mode 100644 src/operator/nn/mkldnn/mkldnn_act.cc diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index b5386babc610..bed54ea4cdd1 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -28,7 +28,7 @@ #include "../tensor/elemwise_unary_op.h" #if MXNET_USE_MKLDNN == 1 #include "./mkldnn/mkldnn_base-inl.h" -#include "./mkldnn/mkldnn_act-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" #endif // MXNET_USE_MKLDNN namespace mxnet { @@ -60,7 +60,7 @@ static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNActivationForward(ctx, param, inputs[0], req[0], outputs[0]); + MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]); return; } #endif @@ -80,8 +80,8 @@ void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs, const ActivationParam& param = nnvm::get(attrs.parsed); #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNActivationBackward(ctx, param, inputs[0], inputs[1], req[0], - outputs[0]); + MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[1], req[0], + outputs[0]); return; } #endif diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h deleted file mode 100644 index 982bf6ec0507..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_act-inl.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_act-inl.h - * \brief - * \author Da Zheng -*/ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../../operator_common.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 - -#include - -namespace mxnet { -namespace op { - -static inline bool SupportMKLDNNAct(const ActivationParam& param) { - // We only enable ReLU for now. It seems other activations have some precision - // problems. - return param.act_type == activation::kReLU; -#if 0 - || param.act_type == activation::kSigmoid - || param.act_type == activation::kSoftReLU; -#endif -} - -static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { - switch (param.act_type) { - case activation::kReLU: - return mkldnn::algorithm::eltwise_relu; - case activation::kSigmoid: - return mkldnn::algorithm::eltwise_logistic; - case activation::kTanh: - return mkldnn::algorithm::eltwise_tanh; - case activation::kSoftReLU: - return mkldnn::algorithm::eltwise_soft_relu; - default: - LOG(FATAL) << "unknown activation type"; - return mkldnn::algorithm::eltwise_relu; - } -} - -typedef std::shared_ptr mkldnn_act_pdesc_ptr; - -template -mkldnn_act_pdesc_ptr GetActForwardDescImpl(const ActivationParam& param, - const OpContext &ctx, - const mkldnn::memory &input_mem) { - mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc(); - mkldnn::memory::desc data_md = data_mpd.desc(); - auto cpu_engine = data_mpd.get_engine(); - Dtype alpha = 0; - - auto alg = GetMKLDNNActAlgo(param); - mkldnn::eltwise_forward::desc desc = ctx.is_train - ? 
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, - alg, data_md, alpha) - : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, - alg, data_md, alpha); - return mkldnn_act_pdesc_ptr(new mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine)); -} - -typedef MKLDNNParamOpSign MKLDNNActSignature; - -template -const mkldnn::eltwise_forward::primitive_desc &GetActForwardDesc( - const ActivationParam& param, const OpContext &ctx, const NDArray &in_data, - const mkldnn::memory &input_mem) { - static thread_local std::unordered_map descs; - MKLDNNActSignature key(param); - key.AddSign(ctx.is_train); - key.AddSign(param.act_type); - key.AddSign(in_data); - - auto it = descs.find(key); - if (it == descs.end()) { - auto desc = GetActForwardDescImpl(param, ctx, input_mem); - auto ins_ret = descs.insert( - std::pair(key, desc)); - CHECK(ins_ret.second); - it = ins_ret.first; - } - return *it->second; -} - -template -void MKLDNNActivationForward(const OpContext &ctx, const ActivationParam& param, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { - auto input_mem = in_data.GetMKLDNNData(); - const mkldnn::eltwise_forward::primitive_desc &pdesc = GetActForwardDesc( - param, ctx, in_data, *input_mem); - auto output_memory = const_cast(out_data).CreateMKLDNNData( - pdesc.dst_primitive_desc()); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterPrim(mkldnn::eltwise_forward(pdesc, *input_mem, *output_memory)); - stream->Submit(); -} - -template -void MKLDNNActivationBackward(const OpContext &ctx, const ActivationParam& param, - const NDArray &out_grad, const NDArray &in_data, - const OpReqType &req, const NDArray &in_grad) { - if (req == kNullOp) { - return; - } - - TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]); - auto diff_dst_memory = out_grad.GetMKLDNNData(); - auto input_mem = in_data.GetMKLDNNData(); - // We need to make sure the two inputs to eltwise_backward has the same memory - // descriptor. Otherwise, the perf will suffer. 
- if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc()) - input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc()); - mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); - mkldnn::memory::desc data_md = data_mpd.desc(); - mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); - auto cpu_engine = data_mpd.get_engine(); - Dtype alpha = 0; - - auto alg = GetMKLDNNActAlgo(param); - mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training, - alg, data_md, alpha); - mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine); - mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha); - mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc); - - auto diff_src_memory = CreateMKLDNNMem(in_grad, - bw_pdesc.diff_src_primitive_desc(), req); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, - *diff_dst_memory, - *diff_src_memory.second)); - CommitOutput(in_grad, diff_src_memory); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet - -#endif -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc new file mode 100644 index 000000000000..3c6606122c20 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_act.cc @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_act.cc + * \brief + * \author Da Zheng +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../operator_common.h" +#include "../activation-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 + +#include + +namespace mxnet { +namespace op { + +bool SupportMKLDNNAct(const ActivationParam& param) { + // We only enable ReLU for now. It seems other activations have some precision + // problems. 
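+  // The sigmoid and soft-relu cases are kept in the #if 0 block below so
+  // they can be re-enabled once their precision issues are resolved.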
+ return param.act_type == activation::kReLU; +#if 0 + || param.act_type == activation::kSigmoid + || param.act_type == activation::kSoftReLU; +#endif +} + +static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { + switch (param.act_type) { + case activation::kReLU: + return mkldnn::algorithm::eltwise_relu; + case activation::kSigmoid: + return mkldnn::algorithm::eltwise_logistic; + case activation::kTanh: + return mkldnn::algorithm::eltwise_tanh; + case activation::kSoftReLU: + return mkldnn::algorithm::eltwise_soft_relu; + default: + LOG(FATAL) << "unknown activation type"; + return mkldnn::algorithm::eltwise_relu; + } +} + +typedef std::shared_ptr mkldnn_act_pdesc_ptr; + +static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( + const ActivationParam& param, bool is_train, + const mkldnn::memory &input_mem, int dtype) { + mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + auto cpu_engine = data_mpd.get_engine(); + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + DType alpha = 0; + auto alg = GetMKLDNNActAlgo(param); + mkldnn::eltwise_forward::desc desc = is_train + ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, + alg, data_md, alpha) + : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, + alg, data_md, alpha); + return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); + }); +} + +typedef MKLDNNParamOpSign MKLDNNActSignature; + +class MKLDNNActForward { + std::shared_ptr fwd; + std::shared_ptr data; + std::shared_ptr out; + + public: + const mkldnn::eltwise_forward::primitive_desc fwd_pd; + + MKLDNNActForward(const ActivationParam& param, bool is_train, + const NDArray &data, const mkldnn::memory &mem): fwd_pd( + GetActFwdDescImpl(param, is_train, mem, data.dtype())) { + } + + void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) { + if (this->data == nullptr) + this->data = std::shared_ptr(new mkldnn::memory( + data.get_primitive_desc(), data.get_data_handle())); + else + this->data->set_data_handle(data.get_data_handle()); + + CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc()); + if (this->out == nullptr) + this->out = std::shared_ptr(new mkldnn::memory( + fwd_pd.dst_primitive_desc(), output.get_data_handle())); + else + this->out->set_data_handle(output.get_data_handle()); + + if (this->fwd == nullptr) { + this->fwd = std::shared_ptr( + new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data), + *this->out)); + } + } + + const mkldnn::eltwise_forward &GetFwd() const { + return *fwd; + } +}; + +static MKLDNNActForward &GetActForward(const ActivationParam& param, + const OpContext &ctx, const NDArray &in_data, + const mkldnn::memory &in_mem) { + static thread_local std::unordered_map fwds; + MKLDNNActSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(param.act_type); + key.AddSign(in_mem); + + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem); + auto ins_ret = fwds.insert(std::pair( + key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + +void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { + const ActivationParam& param = nnvm::get(attrs.parsed); + auto input_mem = in_data.GetMKLDNNData(); + MKLDNNActForward &fwd = GetActForward(param, ctx, in_data, *input_mem); + auto 
out_mem = const_cast(out_data).CreateMKLDNNData(
+      fwd.fwd_pd.dst_primitive_desc());
+  fwd.SetNewMem(*input_mem, *out_mem);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(fwd.GetFwd());
+  stream->Submit();
+}
+
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const OpReqType &req, const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  const ActivationParam& param = nnvm::get(attrs.parsed);
+  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
+  auto diff_dst_memory = out_grad.GetMKLDNNData();
+  auto input_mem = in_data.GetMKLDNNData();
+  // We need to make sure the two inputs to eltwise_backward have the same memory
+  // descriptor. Otherwise, performance will suffer.
+  if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc())
+    input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc());
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();
+  auto cpu_engine = data_mpd.get_engine();
+
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  auto alg = GetMKLDNNActAlgo(param);
+  mkldnn_output_t diff_src_memory;
+
+  MSHADOW_REAL_TYPE_SWITCH(in_data.dtype(), DType, {
+    DType alpha = 0;
+    mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
+                                          alg, data_md, alpha);
+    mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
+    mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha);
+    mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
+                                                      fw_pdesc);
+
+    diff_src_memory = CreateMKLDNNMem(in_grad,
+                                      bw_pdesc.diff_src_primitive_desc(), req);
+    stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem,
+                                                  *diff_dst_memory,
+                                                  *diff_src_memory.second));
+  });
+  CommitOutput(in_grad, diff_src_memory);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 10bb36170e41..b7603d8ba99b 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -140,6 +140,11 @@ static inline bool SupportMKLDNNConv(const NDArray &input) {
   return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
 }
 
+namespace op {
+struct ActivationParam;
+bool SupportMKLDNNAct(const op::ActivationParam& param);
+}
+
 static int GetTypeSize(int dtype) {
   MSHADOW_TYPE_SWITCH(dtype, DType, {
     return sizeof(DType);
@@ -275,6 +280,27 @@ class MKLDNNOpSignature {
   std::vector eles;
   uint64_t hash;
  public:
+  /*
+   * We provide different methods to add a signature to an op.
+   * For operations such as convolution and fully connected, which determine
+   * the optimal data layout for the op, we only need to use the shape and data
+   * type to sign the op. For other operations, such as activation, which use
+   * whatever layout is in the input array, we have to use the shape, the data
+   * type and the layout to sign the op.
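+   *
+   * For example, AddSign(const NDArray &) is sufficient to sign convolution,
+   * while the AddSign(const mkldnn::memory &) overload below also folds the
+   * memory format into the signature for layout-sensitive ops such as
+   * activation.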
+   */
+
+  void AddSign(const mkldnn::memory &mem) {
+    auto desc = mem.get_primitive_desc().desc();
+    hash = hash * 2 + desc.data.format;
+    eles.push_back(desc.data.format);
+    hash = hash * 2 + desc.data.data_type;
+    eles.push_back(desc.data.data_type);
+    for (int i = 0; i < desc.data.ndims; i++) {
+      hash = hash * 2 + desc.data.dims[i];
+      eles.push_back(desc.data.dims[i]);
+    }
+  }
+
   void AddSign(const std::vector &arrs) {
     for (auto &arr : arrs) {
       AddSign(arr);
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 9ceeacddcaa9..c3fdc778724f 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -236,6 +236,9 @@ static inline MKLDNNConvForward &GetConvFwd(
   const ConvolutionParam& param = nnvm::get(attrs.parsed);
   MKLDNNConvSignature key(param);
   key.AddSign(is_train);
+  // Here we can sign the conv op with NDArray because the conv primitive will
+  // decide the right layout for them, so we only need to get the shape and the
+  // data type of the arrays.
   key.AddSign(data);
   key.AddSign(weights);
   key.AddSign(output);
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 820197efa0bb..9149cb0c6a94 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -96,6 +96,14 @@ void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                           const std::vector& req,
                           const std::vector& outputs);
 
+/* For activation */
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                             const NDArray &in_data, const OpReqType &req,
+                             const NDArray &out_data);
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const OpReqType &req, const NDArray &in_grad);
+
 void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
          const mkldnn::memory &out);
 

From e2c5374fae029f2966090e3feea2c1f1ec8c4e83 Mon Sep 17 00:00:00 2001
From: Da Zheng 
Date: Sun, 24 Dec 2017 06:18:24 +0000
Subject: [PATCH 230/264] Fix a bug in check_consistency.

---
 python/mxnet/test_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 0dfeec56c1e7..304173663124 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1282,6 +1282,10 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
                 arr[:] = arg_params[name]
             for name, arr in exe.aux_dict.items():
                 arr[:] = aux_params[name]
+            # We need to initialize the gradient arrays if grad_req is 'add'.
+            if grad_req == "add":
+                for arr in exe.grad_arrays:
+                    arr[:] = np.zeros(arr.shape, dtype=arr.dtype)
         dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list]
         max_idx = np.argmax(dtypes)
 

From fb040e64ffbae7fcf412a8b4fce3f718ec645509 Mon Sep 17 00:00:00 2001
From: Da Zheng 
Date: Sun, 24 Dec 2017 06:53:35 +0000
Subject: [PATCH 231/264] Fix a potential bug when destroying NDArray.

---
 include/mxnet/ndarray.h | 18 +++---------------
 src/ndarray/ndarray.cc  | 25 ++++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 88c6ba7b9417..8dc757bb9f03 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -140,8 +140,8 @@ class NDArray {
   }
 
   inline bool is_view() const {
-    // Sparse arrays don't have a view.
- if (storage_type() == kRowSparseStorage || storage_type() == kCSRStorage) + // Sparse arrays don't have a view. + if (storage_type() == kRowSparseStorage || storage_type() == kCSRStorage) return false; return byte_offset_ > 0 || shape() != ptr_->storage_shape; } @@ -857,19 +857,7 @@ class NDArray { set_aux_shape(i, shape); } /*! \brief destructor */ - ~Chunk() { - bool skip_free = static_data || delay_alloc; - Storage::Handle h = this->shandle; - std::vector aux_h = this->aux_handles; - Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) { - if (skip_free == false) { - Storage::Get()->Free(h); - for (size_t i = 0; i < aux_h.size(); i++) { - if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]); - } - } - }, shandle.ctx, var); - } + ~Chunk(); }; // struct Chunk void SetTBlob() const; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 96363eb7ba80..51b8cbbf04aa 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -105,7 +105,7 @@ NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context c } else if (stype == kCSRStorage) { storage_shape = aux_shapes[csr::kIdx]; #if MXNET_USE_MKLDNN == 1 - } else if (stype == kMKLDNNStorage) { + } else if (stype == kMKLDNNStorage) { storage_shape = shape; #endif } else { @@ -119,6 +119,29 @@ NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context c dtype, aux_types, aux_shapes); } +struct ChunkMem { + Storage::Handle h; + std::vector aux_h; + std::shared_ptr mem; +}; + +NDArray::Chunk::~Chunk() { + bool skip_free = static_data || delay_alloc; + ChunkMem mem; + mem.h = this->shandle; + mem.aux_h = this->aux_handles; + // We want to delete mkldnn memory after deleting the variable. + mem.mem = this->Mkl_mem_; + Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { + if (skip_free == false) { + if (mem.h.size > 0) Storage::Get()->Free(mem.h); + for (size_t i = 0; i < mem.aux_h.size(); i++) { + if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]); + } + } + }, shandle.ctx, var); +} + void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { #if MXNET_USE_MKLDNN == 1 if (storage_type == kMKLDNNStorage) { From a10075a622b1305bffb73fc9610c73a8db834810 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 24 Dec 2017 07:10:06 +0000 Subject: [PATCH 232/264] Fix bugs when allocating mem in NDArray. 
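
Chunk::CheckAndAllocData() returned early on the kMKLDNNStorage path after
calling SetMKLMem(), so the bookkeeping at the end of the function (recording
storage_shape and clearing delay_alloc) never ran for MKLDNN arrays.
Restructure the function as an if/else so both paths share that tail. Also
create the result of ReshapeMKLDNN() with delay_alloc=true, since its memory
is filled by a reorder primitive afterwards.

A simplified sketch of the corrected control flow (the tail lines come from
the context around the hunk below; the delay_alloc reset is from surrounding
code not shown in the diff):

    void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) {
    #if MXNET_USE_MKLDNN == 1
      if (storage_type == kMKLDNNStorage) {
        SetMKLMem(shape, dtype);  // MKLDNN path: memory is held by Mkl_mem_
      } else {
    #endif
        CHECK_NE(aux_shapes.size(), 0)
            << "data is expected to be allocated after aux_data";
        auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
        if (shandle.size < dbytes) {
          // free storage if necessary and alloc again
          if (shandle.size > 0) Storage::Get()->Free(shandle);
          shandle = Storage::Get()->Alloc(dbytes, ctx);
        }
    #if MXNET_USE_MKLDNN == 1
      }
    #endif
      // This tail must run on both paths; the old early return skipped it.
      storage_shape = shape;
      delay_alloc = false;
    }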
--- src/ndarray/ndarray.cc | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 51b8cbbf04aa..26517f3fd4c2 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -146,18 +146,20 @@ void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { #if MXNET_USE_MKLDNN == 1 if (storage_type == kMKLDNNStorage) { SetMKLMem(shape, dtype); - return; - } + } else { #endif - CHECK_NE(aux_shapes.size(), 0) - << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); + CHECK_NE(aux_shapes.size(), 0) + << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); + } +#if MXNET_USE_MKLDNN == 1 } +#endif // init shape storage_shape = shape; // delay_alloc is only set when data storage handle is present @@ -215,7 +217,7 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { ret.shape_ = shape; return ret; } else if (storage_type() == kMKLDNNStorage) { - NDArray ret(kMKLDNNStorage, shape, ctx(), ptr_->delay_alloc, dtype()); + NDArray ret(kMKLDNNStorage, shape, ctx(), true, dtype()); CHECK(ptr_->Mkl_mem_ != nullptr); // We shouldn't submit the reorder primitive here because submit will // be called in operators. From 86945b0afc21a00a92b7c73d599df86eed7f8fdd Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 24 Dec 2017 07:10:40 +0000 Subject: [PATCH 233/264] Fix coding style. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 4 +++- src/operator/tensor/cast_storage.cc | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index b7603d8ba99b..480537d50e7c 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -279,6 +279,7 @@ class MKLDNNStream { class MKLDNNOpSignature { std::vector eles; uint64_t hash; + public: /* * We provide different methods to add signature to an op. @@ -350,8 +351,9 @@ struct MKLDNNOpHash { template class MKLDNNParamOpSign: public MKLDNNOpSignature { const ParamType param; + public: - MKLDNNParamOpSign(const ParamType &_param): param(_param) { + explicit MKLDNNParamOpSign(const ParamType &_param): param(_param) { } bool operator==(const MKLDNNParamOpSign &sign) const { diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 58578ea1b4e8..80d123955369 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -43,11 +43,11 @@ void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, const NDArr CHECK(src.shape() == dns.shape_); if (src.dtype() != dns.type_flag_) { // If the input and output have different data types, we have to convert - // the source array into the default layout, cast the data type and copy - // data to the destination array. + // the source array into the default layout, cast the data type and copy + // data to the destination array. 
const TBlob &src_blob = src.data();
-  CHECK(src.ctx() == dst_arr.ctx());
-  ndarray::Copy(src.data(), &dns, src.ctx(), dst_arr.ctx(), ctx.run_ctx);
+    CHECK(src.ctx() == dst_arr.ctx());
+    ndarray::Copy(src.data(), &dns, src.ctx(), dst_arr.ctx(), ctx.run_ctx);
   } else {
     // This converts the source data to the default format and write the data to
     // the destination directly.

From 69c39acbd9627f46954e682a0457ef7317df4586 Mon Sep 17 00:00:00 2001
From: Da Zheng 
Date: Sun, 24 Dec 2017 07:59:27 +0000
Subject: [PATCH 234/264] Add macro guards when using mkldnn in ndarray.

---
 src/ndarray/ndarray.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 26517f3fd4c2..1eb66e1e7774 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -122,7 +122,9 @@ NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context c
 struct ChunkMem {
   Storage::Handle h;
   std::vector aux_h;
+#if MXNET_USE_MKLDNN == 1
   std::shared_ptr mem;
+#endif
 };
 
 NDArray::Chunk::~Chunk() {
@@ -130,8 +132,10 @@ NDArray::Chunk::~Chunk() {
   ChunkMem mem;
   mem.h = this->shandle;
   mem.aux_h = this->aux_handles;
+#if MXNET_USE_MKLDNN == 1
   // We want to delete mkldnn memory after deleting the variable.
   mem.mem = this->Mkl_mem_;
+#endif
   Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) {
     if (skip_free == false) {
       if (mem.h.size > 0) Storage::Get()->Free(mem.h);

From 5cd1adf649faec9b86cbac0a3f4221349686ec5f Mon Sep 17 00:00:00 2001
From: Da Zheng 
Date: Sun, 24 Dec 2017 08:26:51 +0000
Subject: [PATCH 235/264] Fix a compilation error.

---
 src/operator/nn/mkldnn/mkldnn_act.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 3c6606122c20..7783ddd5fa4e 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -77,9 +77,9 @@ static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
   mkldnn::memory::desc data_md = data_mpd.desc();
   auto cpu_engine = data_mpd.get_engine();
 
+  auto alg = GetMKLDNNActAlgo(param);
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     DType alpha = 0;
-    auto alg = GetMKLDNNActAlgo(param);
     mkldnn::eltwise_forward::desc desc = is_train
       ?
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, alg, data_md, alpha) @@ -87,6 +87,10 @@ static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( alg, data_md, alpha); return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); }); + LOG(INFO) << "Unsupported data type for MKLDNN activation"; + mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc( + mkldnn::prop_kind::forward_training, alg, data_md, 0.0); + return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); } typedef MKLDNNParamOpSign MKLDNNActSignature; From f20f4c9d3a50490346a0b8f8f7315d7a2b3a3cdd Mon Sep 17 00:00:00 2001 From: Lv Tao Date: Sat, 30 Dec 2017 16:25:54 +0800 Subject: [PATCH 236/264] Add primitive and memory cache for BatchNorm --- src/operator/nn/batch_norm-inl.h | 10 + .../nn/mkldnn/mkldnn_batch_norm-inl.h | 321 ++++++++++++------ 2 files changed, 236 insertions(+), 95 deletions(-) diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index ccedd4685f5c..a07ff40a65f6 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -85,6 +85,16 @@ struct BatchNormParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(cudnn_off).set_default(false) .describe("Do not select CUDNN operator, if available"); } + + bool operator==(const BatchNormParam& other) const { + return this->eps == other.eps && + this->momentum == other.momentum && + this->fix_gamma == other.fix_gamma && + this->use_global_stats == other.use_global_stats && + this->output_mean_var == other.output_mean_var && + this->axis == other.axis && + this->cudnn_off == other.cudnn_off; + } }; static inline bool IsBNWriting(const OpReqType ort) { diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index 6332f0a90f99..512d5215ba16 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -18,9 +18,9 @@ */ /*! 
- * \file mkldnn_batch_norm.cc + * \file mkldnn_batch_norm-inl.h * \brief - * \author Tao Lv + * \author Tao Lv (tao.a.lv@intel.com) */ #ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ @@ -28,6 +28,7 @@ #if MXNET_USE_MKLDNN == 1 #include +#include #include #include "../batch_norm-inl.h" #include "./mkldnn_ops-inl.h" @@ -42,6 +43,7 @@ typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc; typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc; typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc; typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc; +typedef MKLDNNParamOpSign MKLDNNBNSignature; using mkldnn::use_global_stats; using mkldnn::use_scale_shift; @@ -56,8 +58,6 @@ inline static unsigned _GetFlags(const std::vector &in_data, flags |= use_scale_shift; } - // aux_states[0]: inMean - // aux_states[1]: inVariance if (aux_states.size() == 2U && !is_train) { flags |= use_global_stats; } @@ -98,115 +98,246 @@ inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, } template -void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); - unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); - const NDArray &data = in_data[batchnorm::kData]; +class MKLDNNBNForward { + public: + MKLDNNBNForward(const mxnet::NDArray &data, DType eps, + bool is_train, bool scale_shift, + bool global_stats, bool fix_gamma) : + _out_mean(nullptr), _out_var(nullptr), + _flag(0U), _fix_gamma(fix_gamma), _is_train(is_train), + _channels(data.shape()[1]), _eps(eps), + fwd(nullptr), data(nullptr), weight(nullptr), + out(nullptr), mean(nullptr), variance(nullptr) { + _Init(data, scale_shift, global_stats); + } - auto data_mem = data.GetMKLDNNData(); - auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, (DType) param.eps, flags); - const NDArray &out = out_data[batchnorm::kOut]; + ~MKLDNNBNForward() {} + + void SetDataHandle(const std::vector &req, + const mxnet::NDArray &data, + const mxnet::NDArray &output, + const mxnet::TBlob &moving_mean, + const mxnet::TBlob &moving_var, + const mxnet::TBlob &out_mean, + const mxnet::TBlob &out_var, + const mxnet::TBlob *gamma = nullptr, + const mxnet::TBlob *beta = nullptr); + + void Execute(); + + private: + DType *_out_mean; + DType *_out_var; + unsigned _flag; + bool _fix_gamma; + bool _is_train; + nnvm::dim_t _channels; + DType _eps; + + std::shared_ptr fwd; + std::shared_ptr data; + std::shared_ptr weight; + std::shared_ptr out; + std::shared_ptr mean; + std::shared_ptr variance; + + private: + void _Init(const mxnet::NDArray &data, bool scale_shift, bool global_stats); + void _SetWeight(const mxnet::TBlob &gamma, + const mxnet::TBlob &beta, + const OpReqType &req); + void _SetMeanVar(const DType *imean, + const DType *ivar, + DType *omean, + DType *ovar); +}; - // for output memory - auto out_mem = const_cast(out).CreateMKLDNNData(fwd_pd.dst_primitive_desc()); +template +void MKLDNNBNForward::_Init(const mxnet::NDArray &src, bool scale_shift, bool global_stats) { + this->_flag |= scale_shift ? use_scale_shift : 0U; + this->_flag |= global_stats ? use_global_stats : 0U; - // mxnet will always use scale shift. 
- // But if fix_gamma is true, then all scale elements will be set to 1.0f
-  if (flags & use_scale_shift) {
-    const NDArray &gamma    = in_data[batchnorm::kGamma];
-    const NDArray &beta     = in_data[batchnorm::kBeta];
-    CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
-    CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
+  auto src_md = src.GetMKLDNNData()->get_primitive_desc().desc();
+  auto engine = CpuEngine::Get()->get_engine();
 
-    // TODO(tao): how to reuse this memory?
-    std::shared_ptr weight_mem(
-        new mkldnn::memory(fwd_pd.weights_primitive_desc()));
-    DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle());
+  mkldnn::prop_kind prop = forward_training;
+  if (this->_is_train) {
+    prop = forward_training;
+  } else {
+    prop = forward_inference;
+  }
 
-    nnvm::dim_t channels_ = data.shape()[1];
-    DType* weight_ptr = gamma.data().dptr();
-    DType* bias_ptr = beta.data().dptr();
-    if (!param.fix_gamma) {
+  auto fwd_desc = t_bn_f_desc(prop, src_md, this->_eps, this->_flag);
+  auto fwd_pd = t_bn_f_pdesc(fwd_desc, engine);
+
+  this->data.reset(new mkldnn::memory(src.GetMKLDNNData()->get_primitive_desc()));
+  this->out.reset(new mkldnn::memory(fwd_pd.dst_primitive_desc()));
+
+  if (this->_flag & use_scale_shift) {
+    this->weight.reset(new memory(fwd_pd.weights_primitive_desc()));
+  }
+
+  if (this->_is_train || (this->_flag & use_global_stats)) {
+    this->mean.reset(new mkldnn::memory(fwd_pd.mean_primitive_desc()));
+    this->variance.reset(new mkldnn::memory(fwd_pd.variance_primitive_desc()));
+  }
+
+  // For mxnet, there is always a weight (scale/shift).
+  CHECK_EQ(this->_flag & use_scale_shift, use_scale_shift);
+  if (!(this->_is_train)) {
+    this->fwd.reset(
+        new mkldnn::batch_normalization_forward(fwd_pd,
+                                                *(this->data),
+                                                mkldnn::primitive::at(*(this->mean)),
+                                                mkldnn::primitive::at(*(this->variance)),
+                                                mkldnn::primitive::at(*(this->weight)),
+                                                *(this->out)));
+  } else {
+    this->fwd.reset(
+        new mkldnn::batch_normalization_forward(fwd_pd,
+                                                *(this->data),
+                                                mkldnn::primitive::at(*(this->weight)),
+                                                *(this->out),
+                                                *(this->mean),
+                                                *(this->variance)));
+  }
+  return;
+}
+
+template
+void MKLDNNBNForward::SetDataHandle(const std::vector &req,
+                                    const mxnet::NDArray &data,
+                                    const mxnet::NDArray &output,
+                                    const mxnet::TBlob &moving_mean,
+                                    const mxnet::TBlob &moving_var,
+                                    const mxnet::TBlob &out_mean,
+                                    const mxnet::TBlob &out_var,
+                                    const mxnet::TBlob *gamma,
+                                    const mxnet::TBlob *beta) {
+  auto data_mem = data.GetMKLDNNData();
+  auto out_mem = const_cast(output).CreateMKLDNNData(this->out->get_primitive_desc());
+  this->data->set_data_handle(data_mem->get_data_handle());
+  this->out->set_data_handle(out_mem->get_data_handle());
+
+  // weights
+  if (gamma != nullptr && beta != nullptr && (this->_flag & use_scale_shift)) {
+    _SetWeight(*gamma, *beta, req[batchnorm::kGamma]);
+  }
+
+  // mean and variance
+  this->_out_mean = out_mean.dptr();
+  this->_out_var = out_var.dptr();
+  if (!(this->_is_train)) {
+    this->mean->set_data_handle(moving_mean.dptr());
+    this->variance->set_data_handle(moving_var.dptr());
+  } else {
+    this->mean->set_data_handle(this->_out_mean);
+    this->variance->set_data_handle(this->_out_var);
+  }
+}
+
+template
+void MKLDNNBNForward::Execute() {
+  if (!(this->_is_train)) {
+    MKLDNNStream::Get()->RegisterPrim(*(this->fwd));
+    MKLDNNStream::Get()->Submit();
+    _SetMeanVar(reinterpret_cast(this->mean->get_data_handle()),
+                reinterpret_cast(this->variance->get_data_handle()),
+                this->_out_mean, this->_out_var);
+  } else {
+    MKLDNNStream::Get()->RegisterPrim(*(this->fwd));
MKLDNNStream::Get()->Submit(); + _SetMeanVar(reinterpret_cast(this->mean->get_data_handle()), + reinterpret_cast(this->variance->get_data_handle()), + this->_out_mean, this->_out_var); + } +} + +template +void MKLDNNBNForward::_SetWeight(const mxnet::TBlob &gamma, + const mxnet::TBlob &beta, + const OpReqType &req) { + // CHECK_NE(this->weight, nullptr); + DType *gamma_ptr = gamma.dptr(); + DType *beta_ptr = beta.dptr(); + DType *weight_ptr = reinterpret_cast(this->weight->get_data_handle()); + + if (!(this->_fix_gamma)) { #pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - weight_buf[i] = weight_ptr[i]; - weight_buf[channels_ + i] = bias_ptr[i]; // bias + for (int i = 0; i < this->_channels; i++) { + weight_ptr[i] = gamma_ptr[i]; + weight_ptr[this->_channels + i] = beta_ptr[i]; // bias } - } else if (IsBNWriting(req[batchnorm::kGamma])) { + } else if (IsBNWriting(req)) { #pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - weight_buf[i] = (DType)1.0f; + for (int i = 0; i < this->_channels; i++) { weight_ptr[i] = (DType)1.0f; - weight_buf[channels_ + i] = bias_ptr[i]; // bias + weight_ptr[this->_channels + i] = beta_ptr[i]; // bias + gamma_ptr[i] = (DType)1.0f; } } else { #pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - weight_buf[i] = (DType)1.0f; - weight_buf[channels_ + i] = bias_ptr[i]; // bias + for (int i = 0; i < this->_channels; i++) { + weight_ptr[i] = (DType)1.0f; + weight_ptr[this->_channels + i] = beta_ptr[i]; // bias } } +} - if (!ctx.is_train) { - DType* omean = out_data[batchnorm::kMean].data().dptr(); - DType* ovar = out_data[batchnorm::kVar].data().dptr(); - DType* inmean = aux_states[batchnorm::kMovingMean].data().dptr(); - DType* invar = aux_states[batchnorm::kMovingVar].data().dptr(); - // to align with origin implmentation: batch_norm.cc: L164 -#pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - omean[i] = inmean[i]; - ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps); - } - - std::shared_ptr mean_m( - new mkldnn::memory(fwd_pd.mean_primitive_desc(), inmean)); - std::shared_ptr var_m( - new mkldnn::memory(fwd_pd.variance_primitive_desc(), invar)); - auto bn = mkldnn::batch_normalization_forward(fwd_pd, - *data_mem, - mkldnn::primitive::at(*mean_m), - mkldnn::primitive::at(*var_m), - *weight_mem, - *out_mem); - MKLDNNStream::Get()->RegisterPrim(bn); - MKLDNNStream::Get()->Submit(); - } else { // training - const NDArray &outMean = out_data[batchnorm::kMean]; - const NDArray &outVar = out_data[batchnorm::kVar]; - CHECK_EQ(outMean.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(outVar.storage_type(), mxnet::kDefaultStorage); - DType* omean = out_data[batchnorm::kMean].data().dptr(); - DType* ovar = out_data[batchnorm::kVar].data().dptr(); - - std::shared_ptr mean_mem( - new mkldnn::memory(fwd_pd.mean_primitive_desc(), omean)); - std::shared_ptr var_mem( - new mkldnn::memory(fwd_pd.variance_primitive_desc(), ovar)); - - auto bn = mkldnn::batch_normalization_forward(fwd_pd, - mkldnn::primitive::at(*data_mem), - mkldnn::primitive::at(*weight_mem), - *out_mem, - *mean_mem, - *var_mem); - MKLDNNStream::Get()->RegisterPrim(bn); - MKLDNNStream::Get()->Submit(); - DType* mean_mem_ptr = reinterpret_cast(mean_mem->get_data_handle()); - DType* var_mem_ptr = reinterpret_cast(var_mem->get_data_handle()); +template +void MKLDNNBNForward::_SetMeanVar(const DType *imean, + const DType *ivar, + DType *omean, + DType *ovar) { #pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - omean[i] 
= mean_mem_ptr[i]; - ovar[i] = VARIANCE_TO_INVSTD(var_mem_ptr[i], param.eps); - } + for (int i = 0; i < this->_channels; i++) { + omean[i] = imean[i]; + ovar[i] = VARIANCE_TO_INVSTD(ivar[i], this->_eps); } - } else { // no input gamma and beta - LOG(FATAL) << "MKLDNN batch normalization: should not reach here ..."; +} + +template +static inline MKLDNNBNForward &GetBNFwd(const BatchNormParam ¶m, + bool is_train, + const NDArray &data) { + static thread_local std::unordered_map, + MKLDNNOpHash> fwds; + MKLDNNBNSignature key(param); + key.AddSign(is_train); + key.AddSign(data); + + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNBNForward fwd(data, param.eps, is_train, true, + param.use_global_stats, param.fix_gamma); + auto ins_ret = fwds.insert(std::pair >(key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; } + return it->second; +} + +template +void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); + const NDArray &data = in_data[batchnorm::kData]; + auto gamma = in_data[batchnorm::kGamma].data(); + auto beta = in_data[batchnorm::kBeta].data(); + auto moving_mean = aux_states[batchnorm::kMovingMean].data(); + auto moving_var = aux_states[batchnorm::kMovingVar].data(); + const NDArray &out = out_data[batchnorm::kOut]; + auto out_mean = out_data[batchnorm::kMean].data(); + auto out_var = out_data[batchnorm::kVar].data(); + + MKLDNNBNForward &fwd = GetBNFwd(param, ctx.is_train, data); + fwd.SetDataHandle(req, data, out, moving_mean, moving_var, + out_mean, out_var, &gamma, &beta); + fwd.Execute(); } template From 6836bf149d507e4342cd9d41db8506274d597ffe Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 30 Dec 2017 19:12:08 +0000 Subject: [PATCH 237/264] Remove saving reordered mkldnn mem. --- include/mxnet/ndarray.h | 12 ----------- src/ndarray/ndarray.cc | 5 ----- src/operator/nn/mkldnn/mkldnn_base-inl.h | 2 +- src/operator/nn/mkldnn/mkldnn_base.cc | 20 +++++-------------- src/operator/nn/mkldnn/mkldnn_convolution.cc | 2 +- .../nn/mkldnn/mkldnn_deconvolution.cc | 2 +- 6 files changed, 8 insertions(+), 35 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 8dc757bb9f03..f400ef37eb1f 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -558,13 +558,6 @@ class NDArray { #if MXNET_USE_MKLDNN == 1 bool IsMKLDNNDefault() const; - void SaveMKLDNNReorder(std::shared_ptr reorder) const; - const mkldnn::memory *GetMKLDNNReorder() const { - if (ptr_ != nullptr) - return ptr_->Mkl_reorder_.get(); - else - return nullptr; - } /* * All functions below return a raw pointer to mkldnn memory. Actually there * is a shared pointer that hold the memory either in NDArray or in MKLDNN @@ -640,11 +633,6 @@ class NDArray { /*! This is created when data is stored in MKLDNN format. */ std::shared_ptr Mkl_mem_; - /* - * This contains a copy of the original data. However, the data in this - * member may be out of date. TODO(zhengda) we should fix this problem. - */ - std::shared_ptr Mkl_reorder_; #endif /*! 
\brief variable from engine */ Engine::VarHandle var; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 1eb66e1e7774..7e7a508a152e 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -441,11 +441,6 @@ bool NDArray::IsMKLDNNDefault() const { } } -void NDArray::SaveMKLDNNReorder(std::shared_ptr reorder) const { - if (ptr_) - ptr_->Mkl_reorder_ = reorder; -} - void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 480537d50e7c..734305ddc664 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -377,7 +377,7 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, - int num_groups, bool save_reorder = false); + int num_groups); const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::engine &engine, int num_groups = 1); diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 723c359b650d..1cf538f5a86e 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -80,11 +80,8 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, - int num_groups, bool save_reorder) { - const mkldnn::memory *mem = arr.GetMKLDNNReorder(); - if (mem != nullptr) - return mem; - + int num_groups) { + const mkldnn::memory *mem; mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Get()->get_engine(); if (arr.shape().ndim() == 2) { @@ -121,16 +118,9 @@ const mkldnn::memory *GetWeights(const NDArray &arr, } if (mem->get_primitive_desc() == target_pd) return mem; - if (save_reorder) { - std::shared_ptr ret(new mkldnn::memory(target_pd)); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); - arr.SaveMKLDNNReorder(ret); - return ret.get(); - } else { - auto ret = TmpMemMgr::Get()->Alloc(target_pd); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); - return ret; - } + auto ret = TmpMemMgr::Get()->Alloc(target_pd); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; } const mkldnn::memory *GetWeights(const NDArray &arr, diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index c3fdc778724f..b5f54b144b9c 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -268,7 +268,7 @@ void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc()); auto weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), - param.num_group, !ctx.is_train); + param.num_group); auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(), req[conv::kOut]); const mkldnn::memory *bias_mem = nullptr; diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 6435fa868bce..eda28e3d8cff 100644 --- 
a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -164,7 +164,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &c
   auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder(
       deconvFwd_pd.diff_dst_primitive_desc());
   auto weight_mem = GetWeights(in_data[deconv::kWeight],
-      deconvFwd_pd.weights_primitive_desc(), param.num_group, !ctx.is_train);
+      deconvFwd_pd.weights_primitive_desc(), param.num_group);
   auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut],
       deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]);

From 47d9f92d2199e9a9b3ce1cb17917be9c0d7b7021 Mon Sep 17 00:00:00 2001
From: Da Zheng 
Date: Sun, 31 Dec 2017 08:44:34 +0000
Subject: [PATCH 238/264] Fix a bug in concat.

---
 src/operator/nn/concat.cc | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
index 52ccd234db0f..b3de84dbc7ea 100644
--- a/src/operator/nn/concat.cc
+++ b/src/operator/nn/concat.cc
@@ -158,10 +158,9 @@ void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs,
   if (req[0] == kNullOp) return;
 #if MXNET_USE_MKLDNN == 1
   // MKLDNN supports 2D and 4D concat
-  if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) {
-    if (inputs[0].dtype() == mshadow::kFloat32) {
-      MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs);
-    }
+  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
+      && inputs[0].dtype() == mshadow::kFloat32) {
+    MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs);
   } else {
     std::vector in_blobs(inputs.size());
     for (size_t i = 0; i < in_blobs.size(); i++)
@@ -178,10 +177,9 @@ static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                    const OpContext& ctx, const std::vector& inputs,
                                    const std::vector& req, const std::vector& outputs) {
 #if MXNET_USE_MKLDNN == 1
-  if (inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) {
-    if (inputs[0].dtype() == mshadow::kFloat32) {
-      MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs);
-    }
+  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
+      && inputs[0].dtype() == mshadow::kFloat32) {
+    MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs);
   } else {
     std::vector in_blobs(1);
     in_blobs[0] = inputs[0].data();

From a1423efc475ab1d9d96dc8d4ab852bc465c98571 Mon Sep 17 00:00:00 2001
From: Da Zheng 
Date: Sun, 31 Dec 2017 03:48:41 +0000
Subject: [PATCH 239/264] Remove MKLDNNStorage.
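
After this change, data in an MKLDNN-specific layout no longer has its own
storage type: such arrays are plain kDefaultStorage arrays whose Chunk keeps
the data in Mkl_mem_. The new IsMKLDNN() and IsDefault() queries distinguish
the two cases, so callers that need a TBlob must check the layout first.
A minimal sketch of the intended checks (IsDefault() and IsMKLDNN() are the
queries introduced below; the reorder itself goes through the usual MKLDNN
reorder path):

    if (nd.storage_type() == kDefaultStorage) {
      if (nd.IsDefault()) {
        // Plain row-major data: safe to alias directly, e.g. via data()
        // or AsArray().
      } else {
        // Data lives in Mkl_mem_ with an MKLDNN-internal layout; it has to
        // be reordered to the default format before a TBlob can be exposed.
      }
    }

Memory reuse (AsArray) and views are not supported on MKLDNN-layout arrays
yet; the executor and the imperative runtime fall back to allocating fresh
memory in that case (see the TODOs in graph_executor.cc and
imperative_utils.h).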
--- include/mxnet/ndarray.h | 18 +- src/common/exec_utils.h | 2 +- src/executor/graph_executor.cc | 8 +- src/imperative/imperative_utils.h | 5 +- src/ndarray/ndarray.cc | 310 +++++++----------- src/operator/nn/activation.cc | 27 +- src/operator/nn/batch_norm.cc | 32 +- src/operator/nn/concat.cc | 18 +- src/operator/nn/convolution.cc | 24 +- src/operator/nn/deconvolution.cc | 23 +- src/operator/nn/fully_connected.cc | 23 +- src/operator/nn/lrn.cc | 16 +- src/operator/nn/pooling.cc | 10 +- src/operator/nn/softmax.cc | 5 +- src/operator/tensor/cast_storage-inl.h | 11 - src/operator/tensor/cast_storage.cc | 9 - .../tensor/elemwise_binary_op_basic.cc | 32 +- .../tensor/elemwise_binary_scalar_op_basic.cc | 10 - src/operator/tensor/elemwise_sum.cc | 26 +- .../tensor/elemwise_unary_op_basic.cc | 17 +- src/operator/tensor/matrix_op.cc | 15 +- 21 files changed, 249 insertions(+), 392 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index f400ef37eb1f..c473cb70f15d 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -139,10 +139,13 @@ class NDArray { dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) { } - inline bool is_view() const { + inline bool IsView() const { // Sparse arrays don't have a view. if (storage_type() == kRowSparseStorage || storage_type() == kCSRStorage) return false; + // If the array reuses memory, it's not a view. + if (reuse_) + return false; return byte_offset_ > 0 || shape() != ptr_->storage_shape; } @@ -484,9 +487,14 @@ class NDArray { CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; + // TODO we'll fix it later. + CHECK(!IsMKLDNN()); + // We can't reuse memory in a view. + CHECK(!IsView()); NDArray ret = *this; ret.shape_ = shape; ret.dtype_ = dtype; + ret.reuse_ = true; return ret; } /*! @@ -557,7 +565,8 @@ class NDArray { } #if MXNET_USE_MKLDNN == 1 - bool IsMKLDNNDefault() const; + bool IsMKLDNN() const; + bool IsDefault() const; /* * All functions below return a raw pointer to mkldnn memory. Actually there * is a shared pointer that hold the memory either in NDArray or in MKLDNN @@ -818,6 +827,9 @@ class NDArray { // Have MKL memory reference to the data in the default storage // or create memory for MKLDNN. void SetMKLMem(const TShape &shape, int dtype); + // In the data is stored in MKLDNN layout, we reorder data in Mkl_mem_ and + // save the result in shandle. + void Reorder2Default(); #endif // create storage handle for aux data based on shape @@ -858,6 +870,8 @@ class NDArray { size_t byte_offset_ = 0; /*! \brief type of data */ int dtype_ = -1; + /*! \brief whether the NDArray uses memory of another NDArray. */ + bool reuse_ = false; /*! \brief storage type of data */ NDArrayStorageType storage_type_ = kUndefinedStorage; /*! \brief node entry for autograd */ diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 4b90dd81b157..ae54946999ef 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -54,7 +54,7 @@ inline bool SetupDefaultBlobs(const std::vector& src, bool is_default = nd.storage_type() == kDefaultStorage; #if MXNET_USE_MKLDNN == 1 // If this is mkldnn storage and it uses the default layout. 
- is_default = is_default || nd.IsMKLDNNDefault(); + is_default = nd.IsDefault(); #endif if (!is_default) { if (idx_map != nullptr) { diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index a39fddc97a36..65a101dcbebe 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1220,10 +1220,14 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { if (storage_type == kDefaultStorage) { CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet"; const NDArray& src = data_pool_.at(storage_id); - data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); + // TODO this is a temp fix. + if (src.IsMKLDNN()) + data_entry_[i] = NDArray(vshape[i], data_context[i], true, vdtype[i]); + else + data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); } else { data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i], - true, vdtype[i]); + true, vdtype[i]); } if (log_verbose_) { LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type); diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 528cd06c4bee..218914c2891f 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -739,11 +739,14 @@ inline void AllocateMemory(const nnvm::Graph& g, NDArray buff(TShape({static_cast(mem_plan[i].size)}), default_ctx, true, mshadow::kUint8); *arrays[i] = buff.AsArray(shapes[i], dtypes[i]); - } else { + } else if (!arrays[mem_plan[i].sid]->IsMKLDNN()) { + // TODO this is a temp fix. *arrays[i] = arrays[mem_plan[i].sid]->AsArray(shapes[i], dtypes[i]); if (mem_plan[i].inplace && array_reqs->at(i) == kWriteTo) { array_reqs->at(i) = kWriteInplace; } + } else { + *arrays[i] = NDArray(shapes[i], default_ctx, true, dtypes[i]); } } else { *arrays[i] = NDArray(static_cast(stypes[i]), diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 7e7a508a152e..bebcf866482c 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -50,28 +50,12 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { -static inline NDArrayStorageType DetermineSType(NDArrayStorageType stype, - int dtype, const TShape &shape) { -#if MXNET_USE_MKLDNN == 1 - // We can't always generate a MKLDNN storage. If MKLDNN can't support - // the data type, we'll have to fall back to the default storage. 
- if (stype == kMKLDNNStorage && !SupportMKLDNNArray(dtype, shape)) - return kDefaultStorage; - else -#endif - return stype; -} - -NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context ctx, +NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, bool delay_alloc, int dtype, std::vector aux_types, std::vector aux_shapes, TShape storage_shape) : shape_(shape), - dtype_(dtype), storage_type_(DetermineSType(_stype, dtype, shape)), entry_({nullptr, 0, 0}) { - NDArrayStorageType stype = DetermineSType(_stype, dtype, shape); + dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) { // Assign default aux types if not given if (aux_types.size() == 0 -#if MXNET_USE_MKLDNN == 1 - && stype != kMKLDNNStorage -#endif && stype != kDefaultStorage) { if (stype == kRowSparseStorage) { aux_types = {mshadow::kInt64}; @@ -84,9 +68,6 @@ NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context c // Assign default shapes if not given // unknown shapes are intialized as {0} such that Size() would return 0 if (aux_shapes.size() == 0 -#if MXNET_USE_MKLDNN == 1 - && stype != kMKLDNNStorage -#endif && stype != kDefaultStorage) { if (stype == kRowSparseStorage) { aux_shapes = {TShape(mshadow::Shape1(0))}; @@ -104,10 +85,6 @@ NDArray::NDArray(const NDArrayStorageType _stype, const TShape &shape, Context c storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; } else if (stype == kCSRStorage) { storage_shape = aux_shapes[csr::kIdx]; -#if MXNET_USE_MKLDNN == 1 - } else if (stype == kMKLDNNStorage) { - storage_shape = shape; -#endif } else { LOG(FATAL) << "Unknown storage type " << stype; } @@ -147,23 +124,15 @@ NDArray::Chunk::~Chunk() { } void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { -#if MXNET_USE_MKLDNN == 1 - if (storage_type == kMKLDNNStorage) { - SetMKLMem(shape, dtype); - } else { -#endif - CHECK_NE(aux_shapes.size(), 0) - << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); - } -#if MXNET_USE_MKLDNN == 1 + CHECK_NE(aux_shapes.size(), 0) + << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); } -#endif // init shape storage_shape = shape; // delay_alloc is only set when data storage handle is present @@ -190,23 +159,6 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { #if MXNET_USE_MKLDNN == 1 -static inline mkldnn_mem_ptr Reorder2Default(mkldnn_mem_ptr mem, - bool submit_now = true) { - auto format = GetDefaultFormat(mem->get_primitive_desc().desc()); - if (format == mem->get_primitive_desc().desc().data.format) - return mem; - - auto def_pd = GetPrimitiveDesc(mem->get_primitive_desc(), format); - mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd)); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterMem(mem); - stream->RegisterMem(def_mem); - stream->RegisterPrim(mkldnn::reorder(*mem, *def_mem)); - if (submit_now) - stream->Submit(); - return def_mem; -} - struct EmptyMKLDNNDeleter { void operator()(mkldnn::memory *mem) { } @@ -216,36 +168,31 @@ NDArray 
NDArray::ReshapeMKLDNN(const TShape &shape) const { CHECK(!is_none()) << "NDArray is not initialized"; CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape size is larger current shape"; - if (storage_type() == kDefaultStorage) { + CHECK_EQ(storage_type(), kDefaultStorage); + if (!IsMKLDNN()) { NDArray ret = this->Detach(); ret.shape_ = shape; return ret; - } else if (storage_type() == kMKLDNNStorage) { - NDArray ret(kMKLDNNStorage, shape, ctx(), true, dtype()); - CHECK(ptr_->Mkl_mem_ != nullptr); + } else { + NDArray ret(shape, ctx(), true, dtype()); // We shouldn't submit the reorder primitive here because submit will // be called in operators. auto format = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc()); - if (format == ptr_->Mkl_mem_->get_primitive_desc().desc().data.format) { - ret.ptr_->Mkl_mem_ = ptr_->Mkl_mem_; - } else { - auto def_pd = GetPrimitiveDesc(ptr_->Mkl_mem_->get_primitive_desc(), format); - auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterMem(ptr_->Mkl_mem_); - stream->RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *def_mem)); - // def_mem points to a memory region in the temp space. It's only valid - // inside an operator. As such, the returned NDArray can only be valid - // inside an operator and the shared point doesn't need to do anything - // when it's destroyed. - ret.ptr_->Mkl_mem_ = std::shared_ptr(def_mem, - EmptyMKLDNNDeleter()); - } + CHECK_NE(format, ptr_->Mkl_mem_->get_primitive_desc().desc().data.format); + auto def_pd = GetPrimitiveDesc(ptr_->Mkl_mem_->get_primitive_desc(), format); + auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterMem(ptr_->Mkl_mem_); + stream->RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *def_mem)); + // def_mem points to a memory region in the temp space. It's only valid + // inside an operator. As such, the returned NDArray can only be valid + // inside an operator and the shared point doesn't need to do anything + // when it's destroyed. + ret.ptr_->Mkl_mem_ = std::shared_ptr(def_mem, + EmptyMKLDNNDeleter()); ret.byte_offset_ = byte_offset_; return ret; } - LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet"; - return NDArray(); } #endif @@ -254,30 +201,10 @@ NDArray NDArray::Reshape(const TShape &shape) const { CHECK(!is_none()) << "NDArray is not initialized"; CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape size is larger current shape"; - if (storage_type() == kDefaultStorage) { - NDArray ret = this->Detach(); - ret.shape_ = shape; - return ret; -#if MXNET_USE_MKLDNN == 1 - } else if (storage_type() == kMKLDNNStorage) { - NDArray ret = this->Detach(); - ret.shape_ = shape; - // We need to convert the MKL memory to the default layout. 
- Engine::Get()->PushSync([&](RunContext ctx) { - if (this->ptr_->Mkl_mem_) { - auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); - if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { - ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); - } - } - }, ctx(), {}, {ret.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); - ret.WaitToRead(); - return ret; -#endif - } - LOG(FATAL) << "Reshape for storage type " << storage_type() << " is not implemented yet"; - return NDArray(); + CHECK_EQ(storage_type(), kDefaultStorage); + NDArray ret = this->Detach(); + ret.shape_ = shape; + return ret; } NDArray NDArray::ReshapeWithRecord(const TShape &shape) { @@ -303,28 +230,6 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { CHECK_LE(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; -#if MXNET_USE_MKLDNN == 1 - CHECK(storage_type() == kDefaultStorage || storage_type() == kMKLDNNStorage); - if (storage_type() == kMKLDNNStorage) { - NDArray ret = this->Detach(); - ret.shape_[0] = end - begin; - size_t length = shape_.ProdShape(1, shape_.ndim()); - MSHADOW_TYPE_SWITCH(ret.dtype(), DType, { - ret.byte_offset_ += begin * length * sizeof(DType); - }); - - // We need to convert the MKL memory to the default layout. - Engine::Get()->PushSync([&](RunContext ctx) { - auto def_format = GetDefaultFormat(this->ptr_->Mkl_mem_->get_primitive_desc().desc()); - if (this->ptr_->Mkl_mem_->get_primitive_desc().desc().data.format != def_format) { - ret.ptr_->Mkl_mem_ = Reorder2Default(this->ptr_->Mkl_mem_); - } - }, ctx(), {}, {ret.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE("SyncMKLDNN2Default")); - ret.WaitToRead(); - return ret; - } -#endif CHECK_EQ(storage_type(), kDefaultStorage); NDArray ret = this->Detach(); size_t length = shape_.ProdShape(1, shape_.ndim()); @@ -351,12 +256,7 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) { } NDArray NDArray::At(index_t idx) const { -#if MXNET_USE_MKLDNN == 1 - CHECK(storage_type() == kDefaultStorage - || storage_type() == kMKLDNNStorage) -#else CHECK(storage_type() == kDefaultStorage) -#endif << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->Slice(idx, idx+1); if (shape_.ndim() > 1) { @@ -367,12 +267,7 @@ NDArray NDArray::At(index_t idx) const { } NDArray NDArray::AtWithRecord(index_t idx) { -#if MXNET_USE_MKLDNN == 1 - CHECK(storage_type() == kDefaultStorage - || storage_type() == kMKLDNNStorage) -#else CHECK(storage_type() == kDefaultStorage) -#endif << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->SliceWithRecord(idx, idx+1); if (shape_.ndim() > 1) { @@ -430,26 +325,56 @@ static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::de && get_mkldnn_type(dtype) == desc.data.data_type; } -bool NDArray::IsMKLDNNDefault() const { +bool NDArray::IsMKLDNN() const { + // When MKLDNN is enabled, data can be stored in two locations in Chunk: + // shandle or Mkl_mem_. When the data is stored in the default layout, + // the memory should be held by shandle, and Mkl_mem_ references to the + // memory. When the data is stored in special MKLDNN layout, the memory should + // be held by Mkl_mem_. TODO eventually, we want shandle to hold data for both + // cases. 
+ return ptr_->Mkl_mem_ != nullptr + && ptr_->Mkl_mem_->get_data_handle() != ptr_->shandle.dptr; +} + +bool NDArray::IsDefault() const { + if (storage_type() != kDefaultStorage) + return false; // If we don't have mkldnn memory yet, we just assume it's not the default // format. - if (storage_type() == kMKLDNNStorage && ptr_->Mkl_mem_ != nullptr) { + if (ptr_->Mkl_mem_ == nullptr) + return true; + if (ptr_->Mkl_mem_->get_data_handle() == ptr_->shandle.dptr) { auto desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); - return desc.data.format == GetDefaultFormat(desc); + CHECK(desc.data.format == GetDefaultFormat(desc)); + return true; } else { return false; } } +void NDArray::Chunk::Reorder2Default() { + if (Mkl_mem_ == nullptr) + return; + + auto format = GetDefaultFormat(Mkl_mem_->get_primitive_desc().desc()); + CHECK(format != Mkl_mem_->get_primitive_desc().desc().data.format); + + CHECK(shandle.dptr == nullptr); + CheckAndAlloc(); + auto def_pd = GetPrimitiveDesc(Mkl_mem_->get_primitive_desc(), format); + mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd, shandle.dptr)); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::reorder(*Mkl_mem_, *def_mem)); + stream->Submit(); + Mkl_mem_ = nullptr; +} + void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data // in 5 dimensions while the NDArray stores data in 4 dimensions. - // TODO(zhengda) is it possible that the MKL memory is out-of-date? - if (Mkl_mem_ && storage_type == kMKLDNNStorage) { - return; - } else if (Mkl_mem_ && Mkl_mem_->get_data_handle() == shandle.dptr - && same_shape(shape, dtype, Mkl_mem_->get_primitive_desc().desc())) { + if (Mkl_mem_ && Mkl_mem_->get_data_handle() == shandle.dptr + && same_shape(shape, dtype, Mkl_mem_->get_primitive_desc().desc())) { return; } @@ -481,18 +406,10 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { } mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; auto cpu_engine = CpuEngine::Get()->get_engine(); - // If the storage type is the default type, we can just simply - // reference to the memory for the default storage. - if (storage_type == kDefaultStorage) { - if (shandle.dptr == nullptr) - CheckAndAlloc(); - Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, - cpu_engine), shandle.dptr)); - } else if (storage_type == kMKLDNNStorage) { - // If the array uses MKLDNN storage, we need to allocate memory here. - Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(data_md, - cpu_engine))); - } + if (shandle.dptr == nullptr) + CheckAndAlloc(); + Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc( + data_md, cpu_engine), shandle.dptr)); } /* @@ -540,7 +457,7 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; return nullptr; } - CHECK(storage_type() == kMKLDNNStorage || storage_type() == kDefaultStorage); + CHECK(storage_type() == kDefaultStorage); auto mem = GetMKLDNNData(); // If the memory descriptor matches, it's easy. 
@@ -567,12 +484,19 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( } const mkldnn::memory *NDArray::GetMKLDNNData() const { - CHECK(storage_type() == kMKLDNNStorage || storage_type() == kDefaultStorage); - ptr_->SetMKLMem(shape_, dtype_); - CHECK(ptr_->Mkl_mem_ != nullptr); + CHECK(storage_type() == kDefaultStorage); + // If this array uses MKLDNN layout and it's a view, we have to change its + // layout to the default layout. + if (IsMKLDNN() && IsView()) + ptr_->Reorder2Default(); + ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, + dtype_); + // If shandle has data, the data in shandle and Mkl_mem_ should match. + if (ptr_->shandle.dptr) + CHECK(ptr_->shandle.dptr == ptr_->Mkl_mem_->get_data_handle()); MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); auto pd = ptr_->Mkl_mem_->get_primitive_desc(); - if (is_view()) { + if (IsView()) { // Sliced array must use the default layout. CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format); } @@ -615,12 +539,17 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) { } MKLDNNStream *stream = MKLDNNStream::Get(); - ptr_->SetMKLMem(shape_, dtype_); + // If this array uses MKLDNN layout and it's a view, we have to change its + // layout to the default layout. + if (IsMKLDNN() && IsView()) + ptr_->Reorder2Default(); + ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, + dtype_); stream->RegisterMem(ptr_->Mkl_mem_); auto from_desc = mem.get_primitive_desc().desc(); auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); auto from_def_format = GetDefaultFormat(from_desc); - if (is_view()) { + if (IsView()) { // Sliced array must use the default layout. CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format); } @@ -688,9 +617,11 @@ mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc p mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) { mkldnn::memory::primitive_desc _desc = desc; + // This array shouldn't be a view. 
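+  // A view shares its chunk with the source array and must keep the default
+  // layout (see the IsView() checks in GetMKLDNNData), so we can't attach a
+  // separate MKLDNN memory to it here.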
+ CHECK(!IsView()); auto required_format = _desc.desc().data.format; auto def_format = GetDefaultFormat(_desc.desc()); - if (storage_type() != kMKLDNNStorage && required_format != def_format) + if (required_format != def_format) return nullptr; if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { @@ -711,7 +642,7 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc & return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } - ptr_->Mkl_mem_ = mkldnn_mem_ptr(new mkldnn::memory(desc)); + ptr_->Mkl_mem_.reset(new mkldnn::memory(desc)); MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_.get(); } @@ -723,19 +654,16 @@ void NDArray::SetTBlob() const { char *dptr = static_cast(ptr_->shandle.dptr); auto stype = storage_type(); if (stype == kDefaultStorage) { +#if MXNET_USE_MKLDNN == 1 + if (IsMKLDNN()) { + ptr_->Reorder2Default(); + dptr = static_cast(ptr_->shandle.dptr); + } +#endif dptr += byte_offset_; } else if (stype == kCSRStorage || stype == kRowSparseStorage) { CHECK_EQ(byte_offset_, 0); shape = storage_shape(); -#if MXNET_USE_MKLDNN == 1 - } else if (stype == kMKLDNNStorage) { - if (ptr_->Mkl_mem_) - ptr_->Mkl_mem_ = Reorder2Default(ptr_->Mkl_mem_); - else - ptr_->SetMKLMem(shape_, dtype_); - dptr = static_cast(ptr_->Mkl_mem_->get_data_handle()); - dptr += byte_offset_; -#endif } else { LOG(FATAL) << "unknown storage type " << stype; } @@ -1012,22 +940,26 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext // Make a copy of a dense NDArray template inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) { - using namespace mshadow; - CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; - TBlob tmp = to.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), to.ctx(), ctx); -} - #if MXNET_USE_MKLDNN == 1 -inline void CopyFromToMKLDNNImpl(const NDArray& from, const NDArray& to, RunContext ctx) { - auto from_mem = from.GetMKLDNNData(); - auto to_mem = to.GetMKLDNNData(); - size_t size = std::min(from_mem->get_primitive_desc().get_size(), - to_mem->get_primitive_desc().get_size()); - memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); -} + // If neither is MKLDNN, we can copy data normally. 
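+  // Both arrays then expose plain, default-layout buffers through data(),
+  // so the generic ndarray::Copy path below is safe to use.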
+ if (!from.IsMKLDNN() && !to.IsMKLDNN()) { +#endif + using namespace mshadow; + CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; + TBlob tmp = to.data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to.ctx(), ctx); +#if MXNET_USE_MKLDNN == 1 + } else { + auto from_mem = from.GetMKLDNNData(); + auto to_mem = to.GetMKLDNNData(); + CHECK(from_mem->get_primitive_desc() == to_mem->get_primitive_desc()); + size_t size = std::min(from_mem->get_primitive_desc().get_size(), + to_mem->get_primitive_desc().get_size()); + memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); + } #endif +} // Make a copy of an NDArray based on storage type template @@ -1075,10 +1007,6 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to, CopyFromToRspImpl(casted_nd, to, rctx); } else if (to_stype == kCSRStorage) { CopyFromToCsrImpl(casted_nd, to, rctx); -#if MXNET_USE_MKLDNN == 1 - } else if (to_stype == kMKLDNNStorage) { - CopyFromToMKLDNNImpl(casted_nd, to, rctx); -#endif } else { LOG(FATAL) << "unknown storage type" << to_stype; } diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index bed54ea4cdd1..06cc2e7b5819 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -97,15 +97,15 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1); CHECK_EQ(out_attrs->size(), 1); const ActivationParam& param = nnvm::get(attrs.parsed); + bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, + in_attrs, out_attrs); #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; } #endif - return ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, in_attrs, out_attrs); + return ret; } inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, @@ -120,20 +120,21 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, #endif CHECK_EQ(out_attrs->size(), 1U); const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_CUDNN == 1 + bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, + in_attrs, out_attrs); +#else + bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, + in_attrs, out_attrs); +#endif #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; } #endif -#if MXNET_USE_CUDNN == 1 - return ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, in_attrs, out_attrs); -#else - return ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, in_attrs, out_attrs); -#endif + return ret; } MXNET_OPERATOR_REGISTER_UNARY(Activation) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 00857e8cbb00..e64f4ede4a01 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -402,7 +402,7 @@ static inline bool similar_array(const mxnet::NDArray &arr1, #if MXNET_USE_MKLDNN == 1 static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { TShape shape = input.shape(); - bool support = input.storage_type() == kMKLDNNStorage && shape.ndim() == 4 + bool support = SupportMKLDNN(input) && shape.ndim() == 4 && param.axis 
== mxnet::op::batchnorm::DEFAULT_AXIS && shape[param.axis] % 8 == 0; if (support) { @@ -462,7 +462,7 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, TShape shape = inputs[0].shape(); #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNNBN(inputs[0], param) - && inputs[in_data_start].storage_type() == kMKLDNNStorage) { + && inputs[in_data_start].IsMKLDNN()) { std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); std::vector in_data(inputs.begin() + in_data_start, inputs.begin() + aux_states_start); @@ -499,18 +499,11 @@ static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(in_attrs->size(), 5); CHECK_EQ(out_attrs->size(), 3); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - for (int& v : *in_attrs) { - if (v == kUndefinedStorage) v = kDefaultStorage; - } - (*out_attrs)[0] = kMKLDNNStorage; - (*out_attrs)[1] = kDefaultStorage; - (*out_attrs)[2] = kDefaultStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; for (int& v : *in_attrs) { if (v == - 1) v = kDefaultStorage; } @@ -528,20 +521,11 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(in_attrs->size(), 11); CHECK_EQ(out_attrs->size(), 5); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - for (int& v : *in_attrs) { - if (v == kUndefinedStorage) v = kDefaultStorage; - } - (*out_attrs)[0] = kMKLDNNStorage; - (*out_attrs)[1] = kDefaultStorage; - (*out_attrs)[2] = kDefaultStorage; - (*out_attrs)[3] = kDefaultStorage; - (*out_attrs)[4] = kDefaultStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; for (int& v : *in_attrs) { if (v == - 1) v = kDefaultStorage; } diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index b3de84dbc7ea..ec89a3aae974 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -114,14 +114,11 @@ inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask - // There must be at least one array that are in MKLDNN format. 
- && common::ContainsStorage(*in_attrs, kMKLDNNStorage)) { + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; (*out_attrs)[0] = kDefaultStorage; return true; } @@ -134,14 +131,11 @@ inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs, #if MXNET_USE_MKLDNN == 1 CHECK_EQ(out_attrs->size(), in_attrs->size() - 1); if (dev_mask == mshadow::cpu::kDevMask - && in_attrs->at(0) == kMKLDNNStorage) { + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; return true; diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ad79c01ffbb0..d5781c063f3b 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -303,13 +303,11 @@ inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; (*out_attrs)[0] = kDefaultStorage; return true; } @@ -326,21 +324,11 @@ inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[conv::kData] = kMKLDNNStorage; - // We don't want the parameter gradients are stored in MKLDNN storage. - // These will be sent to the KVstore to update the global parameters. - // We should convert storage inside an operator so that we can take - // advantage of TempSpace. 
- (*out_attrs)[conv::kWeight] = kDefaultStorage; - if (!param.no_bias) - (*out_attrs)[conv::kBias] = kDefaultStorage; - for (size_t i = 0; i < out_attrs->size(); i++) - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; return true; diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 71d0139eee6e..f4cba13cb147 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -267,13 +267,11 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; (*out_attrs)[0] = kDefaultStorage; return true; } @@ -289,20 +287,11 @@ inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[deconv::kData] = kMKLDNNStorage; - // We don't want the parameter gradients are stored in MKLDNN storage. - // These will be sent to the KVstore to update the global parameters. - // We should convert storage inside an operator so that we can take - // advantage of TempSpace. - (*out_attrs)[deconv::kWeight] = kDefaultStorage; - if (!param.no_bias) - (*out_attrs)[deconv::kBias] = kDefaultStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; return true; diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 3723a03af8d0..31098413929b 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -138,13 +138,11 @@ inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[0] = kMKLDNNStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; (*out_attrs)[0] = kDefaultStorage; return true; } @@ -160,20 +158,11 @@ inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), out_expected); #if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; - (*out_attrs)[fullc::kData] = kMKLDNNStorage; - // We don't want the parameter gradients are stored in MKLDNN storage. - // These will be sent to the KVstore to update the global parameters. - // We should convert storage inside an operator so that we can take - // advantage of TempSpace. 
- (*out_attrs)[fullc::kWeight] = kDefaultStorage; - if (!param.no_bias) - (*out_attrs)[fullc::kBias] = kDefaultStorage; - return true; - } + else #endif - *dispatch_mode = DispatchMode::kFCompute; + *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; return true; diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 7073054532e8..330a4550f04f 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -85,19 +85,15 @@ inline static bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; #if MXNET_USE_MKLDNN == 1 CHECK(!in_attrs->empty()); if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; - return true; } #endif - *dispatch_mode = DispatchMode::kFCompute; - for (size_t i = 0; i < out_attrs->size(); i++) { + for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; - } return true; } @@ -106,19 +102,15 @@ inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; #if MXNET_USE_MKLDNN == 1 CHECK(!in_attrs->empty()); if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; - return true; } #endif - *dispatch_mode = DispatchMode::kFCompute; - for (size_t i = 0; i < out_attrs->size(); i++) { + for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; - } return true; } diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 62717bf01747..f757473f3c96 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -304,18 +304,15 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1); + *dispatch_mode = DispatchMode::kFCompute; #if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; - return true; } #else CHECK_EQ(out_attrs->size(), 1); #endif - *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; return true; @@ -330,17 +327,14 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, CHECK_EQ(in_attrs->size(), GetNumBackInputs(param)); CHECK_EQ(out_attrs->size(), 1); + *dispatch_mode = DispatchMode::kFCompute; #if MXNET_USE_MKLDNN == 1 if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { *dispatch_mode = DispatchMode::kFComputeEx; - for (size_t i = 0; i < out_attrs->size(); i++) - (*out_attrs)[i] = kMKLDNNStorage; - return true; } #else CHECK_EQ(in_attrs->size(), 3); #endif - *dispatch_mode = DispatchMode::kFCompute; for (size_t i = 0; i < out_attrs->size(); i++) (*out_attrs)[i] = kDefaultStorage; return true; diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 86762adc9a92..5a2071a1a53f 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -62,9 +62,8 @@ inline static bool 
SoftmaxStorageType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1); #if MXNET_USE_MKLDNN == 1 - // We only run MKLDNN op if it runs on CPU and the input data is MKLDNN - // format. - if (dev_mask == mshadow::cpu::kDevMask && (*in_attrs)[0] == kMKLDNNStorage) + // We only run MKLDNN op if it runs on CPU. + if (dev_mask == mshadow::cpu::kDevMask) *dispatch_mode = DispatchMode::kFComputeEx; else #endif diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index e113ee1befa4..6d540b1b3c03 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -324,11 +324,6 @@ void CastStorageCsrDnsImpl(const OpContext& ctx, }); } -#if MXNET_USE_MKLDNN == 1 -void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns); -void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dns); -#endif - template void CastStorageComputeImpl(const OpContext& ctx, const NDArray& input, @@ -347,12 +342,6 @@ void CastStorageComputeImpl(const OpContext& ctx, } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { TBlob ret = output.data(); CastStorageCsrDnsImpl(ctx, input, &ret); -#if MXNET_USE_MKLDNN == 1 - } else if (src_stype == kMKLDNNStorage && dst_stype == kDefaultStorage) { - CastStorageMKLDnsImpl(ctx, input, output); - } else if (src_stype == kDefaultStorage && dst_stype == kMKLDNNStorage) { - CastStorageDnsMKLImpl(ctx, input, output); -#endif } else { LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype; } diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc index 80d123955369..81abcc7dc955 100644 --- a/src/operator/tensor/cast_storage.cc +++ b/src/operator/tensor/cast_storage.cc @@ -67,15 +67,6 @@ void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, const NDArr } } -void CastStorageDnsMKLImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst) { - CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), Context::kCPU); - CHECK(dst.shape() == src.shape()); - CHECK_EQ(dst.dtype(), src.dtype()); - - std::vector net; - net.push_back(mkldnn::reorder(*src.GetMKLDNNData(), *dst.GetMKLDNNData())); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); -} #endif DMLC_REGISTER_PARAMETER(CastStorageParam); diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index c36225078275..93b8d4687453 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -42,7 +42,7 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]); return; } else if (inputs[0].storage_type() == kDefaultStorage - || inputs[1].storage_type() == kDefaultStorage) { + && inputs[1].storage_type() == kDefaultStorage) { // This happens if inputs are supposed to be in MKLDNN format // but MKLDNN doesn't support the data type or the shape. We're // forced to convert it to the default format. @@ -67,22 +67,16 @@ static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 2); CHECK_EQ(out_attrs->size(), 1); + bool ret = ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); #if MXNET_USE_MKLDNN == 1 - // If both inputs can be used by MKLDNN, we want to use MKLDNN. 
- auto support_mkldnn = SupportStorageMKLDNN(in_attrs->at(0)) - && SupportStorageMKLDNN(in_attrs->at(1)); - if (support_mkldnn && dev_mask == mshadow::cpu::kDevMask) { - // However, we only want the output uses mkldnn storage if one of the inputs - // is in mkldnn storage. - auto has_mkldnn = in_attrs->at(0) == kMKLDNNStorage - || in_attrs->at(1) == kMKLDNNStorage; - out_attrs->at(0) = has_mkldnn ? kMKLDNNStorage : kDefaultStorage; + if (dev_mask == mshadow::cpu::kDevMask + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) + && out_attrs->at(0) == kDefaultStorage) { *dispatch_mode = DispatchMode::kFComputeEx; - return true; } #endif - return ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ret; } MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) @@ -117,7 +111,7 @@ static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 2U); #if MXNET_USE_MKLDNN == 1 - if (inputs[0].storage_type() == kMKLDNNStorage) { + if (inputs[0].IsMKLDNN()) { MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]); return; @@ -134,16 +128,14 @@ static inline bool _backward_ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1); CHECK_EQ(out_attrs->size(), 2); + bool ret = ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); #if MXNET_USE_MKLDNN == 1 - if (in_attrs->at(0) == kMKLDNNStorage && dev_mask == mshadow::cpu::kDevMask) { - out_attrs->at(0) = kMKLDNNStorage; - out_attrs->at(1) = kMKLDNNStorage; + if (dev_mask == mshadow::cpu::kDevMask) { *dispatch_mode = DispatchMode::kFComputeEx; - return true; } #endif - return ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ret; } NNVM_REGISTER_OP(_backward_add) diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index 82e497af67dc..226a3b6fb7d4 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -53,12 +53,7 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a std::vector* in_attrs, std::vector* out_attrs) { bool dispatched = false; -#if MXNET_USE_MKLDNN == 1 - if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage, - kMKLDNNStorage, nullptr)) { -#else if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { -#endif dispatched = storage_type_assign(&out_attrs[0], kDefaultStorage, dispatch_mode, @@ -86,12 +81,7 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs, const auto in_stype = in_attrs->at(0); auto &out_stype = out_attrs->at(0); bool dispatched = false; -#if MXNET_USE_MKLDNN == 1 - if (!dispatched && (in_stype == kDefaultStorage - || in_stype == kMKLDNNStorage)) { -#else if (!dispatched && (in_stype == kDefaultStorage)) { -#endif // dns -> dns dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 1b70bbd8f436..9beab1cd0f25 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -81,16 +81,25 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 
      1U);
+  bool ret = ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode,
+                                 in_attrs, out_attrs);
 #if MXNET_USE_MKLDNN == 1
+  // Prefer FComputeEx so that the MKLDNN path can be used whenever possible.
   if (dev_mask == mshadow::cpu::kDevMask
-      && common::ContainsStorage(*in_attrs, kMKLDNNStorage)) {
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && out_attrs->at(0) == kDefaultStorage) {
     *dispatch_mode = DispatchMode::kFComputeEx;
-    (*out_attrs)[0] = kMKLDNNStorage;
-    return true;
   }
 #endif
-  return ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode,
-                             in_attrs, out_attrs);
+  return ret;
+}
+
+// Returns true only when every array in `arrs' uses an MKLDNN layout.
+static inline bool IsMKLDNN(const std::vector<NDArray> &arrs) {
+  for (auto &arr : arrs) {
+    if (!arr.IsMKLDNN())
+      return false;
+  }
+  return true;
 }
 
 void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
@@ -102,15 +111,18 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   if (req[0] == kNullOp) return;
-  CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
   if (inputs[0].storage_type() == kRowSparseStorage) {
+    CHECK_EQ(req[0], kWriteTo)
+        << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
     mshadow::Stream<cpu>* s = op_ctx.get_stream<cpu>();
     Resource rsc = ResourceManager::Get()->Request(op_ctx.run_ctx.get_ctx(),
         ResourceRequest(ResourceRequest::kTempSpace));
     NDArray out_nd = outputs[0];
     mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd);
 #if MXNET_USE_MKLDNN == 1
-  } else if (common::ContainsStorage(inputs, kMKLDNNStorage)) {
+  } else if (IsMKLDNN(inputs)) {
+    CHECK_EQ(req[0], kWriteTo)
+        << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
     MKLDNNSumForward(attrs, op_ctx, inputs, req[0], outputs[0]);
 #endif
   } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index bcbe55c9406c..52ae037739f6 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -119,10 +119,10 @@ static void CopyEx(const nnvm::NodeAttrs& attrs,
   const auto in_stype = inputs[0].storage_type();
   const auto out_stype = outputs[0].storage_type();
 #if MXNET_USE_MKLDNN == 1
-  if (in_stype == kMKLDNNStorage) {
+  if (inputs[0].IsMKLDNN()) {
     MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
     return;
-  } else if (inputs[0].storage_type() == kDefaultStorage) {
+  } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) {
     // This happens if inputs are supposed to be in MKLDNN format
     // but MKLDNN doesn't support the data type or the shape. We're
     // forced to convert it to the default format.
@@ -144,15 +144,18 @@ static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs,
                                    std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 1);
   CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode,
+                                                          in_attrs, out_attrs);
 #if MXNET_USE_MKLDNN == 1
+  // We have to make sure all inputs use the default storage. Otherwise, we
+  // might need to fall back.
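+  // Note that an input can still carry an MKLDNN layout at runtime even when
+  // its storage type is kDefaultStorage; CopyEx above detects that case with
+  // IsMKLDNN() and dispatches to MKLDNNCopy.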
+ if (dev_mask == mshadow::cpu::kDevMask + && in_attrs->at(0) == kDefaultStorage + && out_attrs->at(0) == kDefaultStorage) { *dispatch_mode = DispatchMode::kFComputeEx; - return true; } #endif - return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ret; } MXNET_OPERATOR_REGISTER_UNARY(_copy) diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 9ce0a7273568..05403030056b 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -133,10 +133,10 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs, #if MXNET_USE_MKLDNN == 1 const auto in_stype = inputs[0].storage_type(); const auto out_stype = outputs[0].storage_type(); - if (in_stype == kMKLDNNStorage) { + if (inputs[0].IsMKLDNN()) { MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); return; - } else if (in_stype == kDefaultStorage) { + } else { // This happens if inputs are supposed to be in MKLDNN format // but MKLDNN doesn't support the data type or the shape. We're // forced to convert it to the default format. @@ -157,15 +157,16 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1); CHECK_EQ(out_attrs->size(), 1); + bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); #if MXNET_USE_MKLDNN == 1 - if (in_attrs->at(0) == kMKLDNNStorage && dev_mask == mshadow::cpu::kDevMask) { - out_attrs->at(0) = kMKLDNNStorage; + if (dev_mask == mshadow::cpu::kDevMask + && in_attrs->at(0) == kDefaultStorage + && out_attrs->at(0) == kDefaultStorage) { *dispatch_mode = DispatchMode::kFComputeEx; - return true; } #endif - return ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + return ret; } NNVM_REGISTER_OP(Flatten) From 8dba402123d8ada2c8a0d8e5255314247b49b87c Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 31 Dec 2017 19:45:40 +0000 Subject: [PATCH 240/264] Force weight grad to use default layout. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 11 ++++++++++ src/operator/nn/mkldnn/mkldnn_base.cc | 22 +++++++++++++++++++ src/operator/nn/mkldnn/mkldnn_convolution.cc | 6 ++--- .../nn/mkldnn/mkldnn_deconvolution.cc | 11 ++++++---- .../nn/mkldnn/mkldnn_fully_connected.cc | 11 ++++++---- 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 734305ddc664..4bf23a81f532 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -371,10 +371,21 @@ enum OutDataOp { typedef std::pair mkldnn_output_t; +/* + * These two functions try to create MKLDNN memory in an NDArray based on `req'. + * The difference is that the first function can create MKLDNN memory with + * special layouts in an NDArray, while the second one can only create MKLDNN + * memory with default layouts. 
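+ * The second function is meant for weight gradients: these are sent to the
+ * KVstore to update the global parameters, which expects the default layout.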
+ */ mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, const mkldnn::memory::primitive_desc &desc, OpReqType req); +mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req); + void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); + const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, int num_groups); diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 1cf538f5a86e..9cc59952daf8 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -64,6 +64,28 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, } } +mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req) { + if (kAddTo == req) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::AddBack, tmp); + } else { + auto _desc = desc; + auto def_format = GetDefaultFormat(_desc.desc()); + mkldnn::memory *mem = nullptr; + if (def_format == _desc.desc().data.format) { + mem = const_cast(arr).CreateMKLDNNData(desc); + } + if (mem == nullptr) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { + return mkldnn_output_t(OutDataOp::Noop, mem); + } + } +} + void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { if (res.first == CopyBack) { const_cast(arr).CopyFrom(*res.second); diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index b5f54b144b9c..f7521aa2ac76 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -317,9 +317,9 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct bwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = CreateMKLDNNMem(in_grad[conv::kWeight], - bwdWeights_pd.diff_weights_primitive_desc(), - req[conv::kWeight]); + auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[conv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), + req[conv::kWeight]); mkldnn_output_t in_grad_bias; if (param.no_bias) { MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index eda28e3d8cff..da2536ead64a 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -166,7 +166,8 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &c auto weight_mem = GetWeights(in_data[deconv::kWeight], deconvFwd_pd.weights_primitive_desc(), param.num_group); auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], - deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); + deconvFwd_pd.diff_src_primitive_desc(), + req[deconv::kOut]); MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data( deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second)); @@ -203,7 +204,8 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext & auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], bwdData_pd.weights_primitive_desc(), param.num_group); auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], - bwdData_pd.dst_primitive_desc(), req[deconv::kData]); + bwdData_pd.dst_primitive_desc(), + 
req[deconv::kData]); MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); CommitOutput(in_grad[deconv::kData], in_grad_mem); @@ -217,8 +219,9 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext & bwdWeights_pd.src_primitive_desc()); auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( bwdWeights_pd.diff_dst_primitive_desc()); - auto in_grad_weight = CreateMKLDNNMem(in_grad[deconv::kWeight], - bwdWeights_pd.diff_weights_primitive_desc(), req[deconv::kWeight]); + auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), + req[deconv::kWeight]); MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); CommitOutput(in_grad[deconv::kWeight], in_grad_weight); diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index d82bc1a24c0a..451b94060a41 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -161,7 +161,8 @@ void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, ipBwdData_pd.diff_dst_primitive_desc()); auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc()); auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData], - ipBwdData_pd.diff_src_primitive_desc(), req[fullc::kData]); + ipBwdData_pd.diff_src_primitive_desc(), + req[fullc::kData]); MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data( ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); CommitOutput(in_grad[fullc::kData], in_grad_mem); @@ -173,15 +174,17 @@ void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto out_grad_mem = out_grad.GetMKLDNNDataReorder( ipBwdWeights_pd.diff_dst_primitive_desc()); auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = CreateMKLDNNMem(in_grad[fullc::kWeight], - ipBwdWeights_pd.diff_weights_primitive_desc(), req[fullc::kWeight]); + auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[fullc::kWeight], + ipBwdWeights_pd.diff_weights_primitive_desc(), + req[fullc::kWeight]); mkldnn_output_t in_grad_bias; if (param.no_bias) { MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); } else { in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias], - ipBwdWeights_pd.diff_bias_primitive_desc(), req[fullc::kBias]); + ipBwdWeights_pd.diff_bias_primitive_desc(), + req[fullc::kBias]); MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, *in_grad_bias.second)); From bcd3253ed20d2bdb3c8bc0fcc154021196341770 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sun, 31 Dec 2017 21:22:47 +0000 Subject: [PATCH 241/264] Reorder weight arrays in (de)conv for faster inference. 
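
The idea: during inference the layout MKLDNN picks for a primitive's weights
is stable, so the weight NDArray can be reordered in place once and then
reused, instead of being reordered on every forward call. A sketch of the
call pattern this patch adds (taken from the convolution forward below;
deconvolution is analogous):

    const mkldnn::memory *weight_mem;
    if (ctx.is_train) {
      weight_mem = GetWeights(in_data[conv::kWeight],
                              fwd.fwd_pd.weights_primitive_desc(),
                              param.num_group);
    } else {
      // Reorder in place once; later calls find the layout already matches.
      const_cast<NDArray &>(in_data[conv::kWeight]).Reorder(
          fwd.fwd_pd.weights_primitive_desc());
      weight_mem = in_data[conv::kWeight].GetMKLDNNData();
    }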
--- include/mxnet/ndarray.h | 15 ++++- src/ndarray/ndarray.cc | 61 ++++++++++++++++--- src/operator/nn/mkldnn/mkldnn_convolution.cc | 13 +++- .../nn/mkldnn/mkldnn_deconvolution.cc | 17 +++++- 4 files changed, 89 insertions(+), 17 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index c473cb70f15d..3c75c3962071 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -565,8 +565,12 @@ class NDArray { } #if MXNET_USE_MKLDNN == 1 - bool IsMKLDNN() const; - bool IsDefault() const; + bool IsMKLDNN() const { + return ptr_->IsMKLDNN(); + } + bool IsDefault() const { + return ptr_->IsDefault(); + } /* * All functions below return a raw pointer to mkldnn memory. Actually there * is a shared pointer that hold the memory either in NDArray or in MKLDNN @@ -596,6 +600,11 @@ class NDArray { mkldnn::memory *CreateMKLDNNData( const mkldnn::memory::primitive_desc &desc); + /* + * Reorder the memory to the specified layout. + */ + void Reorder(const mkldnn::memory::primitive_desc &desc); + /* * This function is used inside operators to reshape an array. * It's used by FullyConnected right now. @@ -830,6 +839,8 @@ class NDArray { // In the data is stored in MKLDNN layout, we reorder data in Mkl_mem_ and // save the result in shandle. void Reorder2Default(); + bool IsMKLDNN() const; + bool IsDefault() const; #endif // create storage handle for aux data based on shape diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index bebcf866482c..c23d14b4d4e9 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -325,26 +325,25 @@ static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::de && get_mkldnn_type(dtype) == desc.data.data_type; } -bool NDArray::IsMKLDNN() const { +bool NDArray::Chunk::IsMKLDNN() const { // When MKLDNN is enabled, data can be stored in two locations in Chunk: // shandle or Mkl_mem_. When the data is stored in the default layout, // the memory should be held by shandle, and Mkl_mem_ references to the // memory. When the data is stored in special MKLDNN layout, the memory should // be held by Mkl_mem_. TODO eventually, we want shandle to hold data for both // cases. - return ptr_->Mkl_mem_ != nullptr - && ptr_->Mkl_mem_->get_data_handle() != ptr_->shandle.dptr; + return Mkl_mem_ != nullptr && Mkl_mem_->get_data_handle() != shandle.dptr; } -bool NDArray::IsDefault() const { - if (storage_type() != kDefaultStorage) +bool NDArray::Chunk::IsDefault() const { + if (storage_type != kDefaultStorage) return false; // If we don't have mkldnn memory yet, we just assume it's not the default // format. - if (ptr_->Mkl_mem_ == nullptr) + if (Mkl_mem_ == nullptr) return true; - if (ptr_->Mkl_mem_->get_data_handle() == ptr_->shandle.dptr) { - auto desc = ptr_->Mkl_mem_->get_primitive_desc().desc(); + if (Mkl_mem_->get_data_handle() == shandle.dptr) { + auto desc = Mkl_mem_->get_primitive_desc().desc(); CHECK(desc.data.format == GetDefaultFormat(desc)); return true; } else { @@ -370,6 +369,9 @@ void NDArray::Chunk::Reorder2Default() { } void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { + // In this case, data is stored in Mkl_mem_ + if (shandle.dptr == nullptr && Mkl_mem_ != nullptr) + return; // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data // in 5 dimensions while the NDArray stores data in 4 dimensions. @@ -489,8 +491,7 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const { // layout to the default layout. 
if (IsMKLDNN() && IsView()) ptr_->Reorder2Default(); - ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, - dtype_); + ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_); // If shandle has data, the data in shandle and Mkl_mem_ should match. if (ptr_->shandle.dptr) CHECK(ptr_->shandle.dptr == ptr_->Mkl_mem_->get_data_handle()); @@ -525,6 +526,46 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const { } } +void NDArray::Reorder(const mkldnn::memory::primitive_desc &pd) { + CHECK_EQ(storage_type(), kDefaultStorage); + // If the memory already uses the specified layout, don't do anything. + if (ptr_->Mkl_mem_ != nullptr && ptr_->Mkl_mem_->get_primitive_desc() == pd) + return; + auto _pd = pd; + auto _desc = _pd.desc(); + auto def_format = GetDefaultFormat(_desc); + // If the memory is default, don't do anything. + if (def_format == _desc.data.format && ptr_->IsDefault()) + return; + // If the specified layout is default, we should use Reorder2Default. + if (def_format == _desc.data.format) { + ptr_->Reorder2Default(); + return; + } + + std::shared_ptr new_mem(new mkldnn::memory(pd)); + ptr_->SetMKLMem(shape_, dtype_); + auto old_mem = ptr_->Mkl_mem_; + // It's possible that the specified layout has a different number of dimensions. + if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) { + // For now, we only support reorder from the default layout. + CHECK(ptr_->IsDefault()); + auto def_pd = GetPrimitiveDesc(pd, def_format); + old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle())); + } + // This may be called in MKLDNN operators. We can't use MKLDNNStream here. + std::vector net; + net.push_back(mkldnn::reorder(*old_mem, *new_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + + ptr_->Mkl_mem_ = new_mem; + // If the array stores data in the default layout, we should free the memory. + if (ptr_->shandle.dptr) { + Storage::Get()->Free(ptr_->shandle); + ptr_->shandle.dptr = nullptr; + } +} + void NDArray::CopyFrom(const mkldnn::memory &mem) { if (ptr_ == nullptr) { LOG(FATAL) << "The NDArray hasn't been initialized"; diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index f7521aa2ac76..3aed59aab4ab 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -267,8 +267,17 @@ void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc()); - auto weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), - param.num_group); + const mkldnn::memory *weight_mem; + if (ctx.is_train) { + weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), + param.num_group); + } else { + // For inference, we want to reorder the weight array so we don't need to + // reorder data every time. 
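+    // Reorder() returns immediately when the array already uses the
+    // requested layout, so only the first inference call pays the cost.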
+ const_cast(in_data[conv::kWeight]).Reorder( + fwd.fwd_pd.weights_primitive_desc()); + weight_mem = in_data[conv::kWeight].GetMKLDNNData(); + } auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(), req[conv::kOut]); const mkldnn::memory *bias_mem = nullptr; diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index da2536ead64a..12233b6cbce1 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -163,8 +163,18 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &c out_data[deconv::kOut]); auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( deconvFwd_pd.diff_dst_primitive_desc()); - auto weight_mem = GetWeights(in_data[deconv::kWeight], - deconvFwd_pd.weights_primitive_desc(), param.num_group); + const mkldnn::memory *weight_mem; + if (ctx.is_train) { + weight_mem = GetWeights(in_data[deconv::kWeight], + deconvFwd_pd.weights_primitive_desc(), + param.num_group); + } else { + // For inference, we want to reorder the weight array so we don't need to + // reorder data every time. + const_cast(in_data[deconv::kWeight]).Reorder( + deconvFwd_pd.weights_primitive_desc()); + weight_mem = in_data[deconv::kWeight].GetMKLDNNData(); + } auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], deconvFwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); @@ -202,7 +212,8 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext & bwdData_pd.src_primitive_desc()); if (req[deconv::kData]) { auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], - bwdData_pd.weights_primitive_desc(), param.num_group); + bwdData_pd.weights_primitive_desc(), + param.num_group); auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], bwdData_pd.dst_primitive_desc(), req[deconv::kData]); From c9945482c0699e163eaa1bd2a78e4ac301605c22 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 02:04:38 +0000 Subject: [PATCH 242/264] handle diff layouts in CopyFromToDnsImpl. --- src/ndarray/ndarray.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index c23d14b4d4e9..42ef318b22cb 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -994,10 +994,15 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext } else { auto from_mem = from.GetMKLDNNData(); auto to_mem = to.GetMKLDNNData(); - CHECK(from_mem->get_primitive_desc() == to_mem->get_primitive_desc()); - size_t size = std::min(from_mem->get_primitive_desc().get_size(), - to_mem->get_primitive_desc().get_size()); - memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); + if (from_mem->get_primitive_desc() == to_mem->get_primitive_desc()) { + size_t size = std::min(from_mem->get_primitive_desc().get_size(), + to_mem->get_primitive_desc().get_size()); + memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); + } else { + std::vector net; + net.push_back(mkldnn::reorder(*from_mem, *to_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + } } #endif } From 7756b448ce42d06d86c437f316a2ed8e10234cf0 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 02:05:25 +0000 Subject: [PATCH 243/264] use NDArray::data() instead of fallback for arrays with MKLDNN layout. 
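
NDArray::SetTBlob() (changed earlier in this series) now reorders
MKLDNN-layout data back to the default layout on demand:

    if (IsMKLDNN()) {
      ptr_->Reorder2Default();
      dptr = static_cast<char *>(ptr_->shandle.dptr);
    }

so SetupDefaultBlobs no longer needs to treat such arrays as non-default and
copy them into temporary buffers; calling data() on them is safe.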
--- src/common/exec_utils.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index ae54946999ef..5f656d4263fe 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -52,10 +52,6 @@ inline bool SetupDefaultBlobs(const std::vector& src,
   for (size_t i = 0; i < src.size(); i++) {
     auto& nd = src[i];
     bool is_default = nd.storage_type() == kDefaultStorage;
-#if MXNET_USE_MKLDNN == 1
-    // If this is mkldnn storage and it uses the default layout.
-    is_default = nd.IsDefault();
-#endif
     if (!is_default) {
       if (idx_map != nullptr) {
         (*idx_map)[i] = temp_dst->size();

From bdbc74b1e019a81ac9ab8cef1093f07ce9623043 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Sun, 31 Dec 2017 19:44:16 +0000
Subject: [PATCH 244/264] Fix a bug in NDArray.

--- src/ndarray/ndarray.cc | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 42ef318b22cb..88aab4768d01 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -190,6 +190,7 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const {
     // when it's destroyed.
     ret.ptr_->Mkl_mem_ = std::shared_ptr(def_mem,
                                          EmptyMKLDNNDeleter());
+    ret.ptr_->delay_alloc = false;
     ret.byte_offset_ = byte_offset_;
     return ret;
   }
@@ -359,6 +360,8 @@ void NDArray::Chunk::Reorder2Default() {
   CHECK(format != Mkl_mem_->get_primitive_desc().desc().data.format);

   CHECK(shandle.dptr == nullptr);
+  // CheckAndAlloc only allocates memory if delay_alloc is true.
+  delay_alloc = true;
   CheckAndAlloc();
   auto def_pd = GetPrimitiveDesc(Mkl_mem_->get_primitive_desc(), format);
   mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd, shandle.dptr));
@@ -408,8 +411,10 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
   }
   mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
   auto cpu_engine = CpuEngine::Get()->get_engine();
-  if (shandle.dptr == nullptr)
+  if (shandle.dptr == nullptr) {
+    CHECK(delay_alloc);
     CheckAndAlloc();
+  }
   Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc(
           data_md, cpu_engine), shandle.dptr));
 }
@@ -501,7 +506,7 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const {
     // Sliced array must use the default layout.
     CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format);
   }
-  if (byte_offset_ > 0) {
+  if (IsView()) {
     void *off_addr = static_cast(ptr_->Mkl_mem_->get_data_handle())
         + byte_offset_;

@@ -657,19 +662,17 @@ mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc p
                                                 mkldnn_memory_format_t format);

 mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
-  mkldnn::memory::primitive_desc _desc = desc;
   // This array shouldn't be a view.
   CHECK(!IsView());
-  auto required_format = _desc.desc().data.format;
-  auto def_format = GetDefaultFormat(_desc.desc());
-  if (required_format != def_format)
-    return nullptr;

   if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
   }

+  mkldnn::memory::primitive_desc _desc = desc;
+  auto required_format = _desc.desc().data.format;
+  auto def_format = GetDefaultFormat(_desc.desc());
   // If the required format is a default format, we don't need to worry about the shape.
   // If the shape isn't the same, it actually implicitly reshapes data.
if (required_format == def_format) { @@ -684,6 +687,7 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc & } ptr_->Mkl_mem_.reset(new mkldnn::memory(desc)); + ptr_->delay_alloc = false; MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_.get(); } From 04ff46999cd6534a5929c82af15148f737612994 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 02:55:01 +0000 Subject: [PATCH 245/264] Fix a bug in Flatten. --- include/mxnet/ndarray.h | 4 ++++ src/operator/tensor/matrix_op.cc | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 3c75c3962071..d0bd11ca24f1 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -604,6 +604,10 @@ class NDArray { * Reorder the memory to the specified layout. */ void Reorder(const mkldnn::memory::primitive_desc &desc); + void Reorder2Default() { + CHECK_EQ(storage_type(), kDefaultStorage); + ptr_->Reorder2Default(); + } /* * This function is used inside operators to reshape an array. diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 05403030056b..fdada08d63a9 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -135,6 +135,10 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs, const auto out_stype = outputs[0].storage_type(); if (inputs[0].IsMKLDNN()) { MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); + // If the output is a special MKLDNN layout and the number of dimensions + // is larger than 2, we should use the default layout. + if (outputs[0].IsMKLDNN() && inputs[0].shape().ndim() > 2) + const_cast(outputs[0]).Reorder2Default(); return; } else { // This happens if inputs are supposed to be in MKLDNN format From 1bf20372604b07dd4a952e92b68ea018d27974c4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 19:33:12 +0000 Subject: [PATCH 246/264] Fallback correctly. --- src/common/exec_utils.h | 4 ++++ src/operator/tensor/cast_storage-inl.h | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 5f656d4263fe..a61c2baf9728 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -52,6 +52,10 @@ inline bool SetupDefaultBlobs(const std::vector& src, for (size_t i = 0; i < src.size(); i++) { auto& nd = src[i]; bool is_default = nd.storage_type() == kDefaultStorage; +#if MXNET_USE_MKLDNN == 1 + // We have to make sure it's default storage and default layout. + is_default = nd.IsDefault(); +#endif if (!is_default) { if (idx_map != nullptr) { (*idx_map)[i] = temp_dst->size(); diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index 6d540b1b3c03..1510766639bc 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -32,6 +32,9 @@ #ifdef __CUDACC__ #include "./cast_storage-inl.cuh" #endif // __CUDACC__ +#if MXNET_USE_MKLDNN == 1 +#include "../nn/mkldnn/mkldnn_base-inl.h" +#endif namespace mxnet { @@ -342,6 +345,14 @@ void CastStorageComputeImpl(const OpContext& ctx, } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { TBlob ret = output.data(); CastStorageCsrDnsImpl(ctx, input, &ret); +#if MXNET_USE_MKLDNN == 1 + } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) { + // In this case, one of the arrays must use non-default layout. 
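+    // CopyFrom() registers an MKLDNN reorder between the two layouts and
+    // Submit() executes it, so the cast is complete when we return.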
+ CHECK(input.IsMKLDNN() || output.IsMKLDNN()); + auto in_mem = input.GetMKLDNNData(); + const_cast(output).CopyFrom(*in_mem); + MKLDNNStream::Get()->Submit(); +#endif } else { LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype; } From 79ac700f2e1cf1e0b2029bbfe6d7a55849c8ac91 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 20:15:28 +0000 Subject: [PATCH 247/264] handle ndarray with def layout in mkldnn BN correctly. --- src/operator/nn/batch_norm.cc | 16 ++++--------- .../nn/mkldnn/mkldnn_batch_norm-inl.h | 23 +++++++++++-------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index e64f4ede4a01..55b1969dacbb 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -402,17 +402,9 @@ static inline bool similar_array(const mxnet::NDArray &arr1, #if MXNET_USE_MKLDNN == 1 static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { TShape shape = input.shape(); - bool support = SupportMKLDNN(input) && shape.ndim() == 4 + return SupportMKLDNN(input) && shape.ndim() == 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS && shape[param.axis] % 8 == 0; - if (support) { - // We need to test its data layout. MKLDNN batchnorm doesn't work well on - // the default layout. - auto mem = input.GetMKLDNNData(); - auto desc = mem->get_primitive_desc().desc(); - support = desc.data.format != GetDefaultFormat(desc); - } - return support; } #endif @@ -424,7 +416,8 @@ void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, CHECK_EQ(inputs.size(), 5U); #if MXNET_USE_MKLDNN == 1 const BatchNormParam ¶m = nnvm::get(attrs.parsed); - if (SupportMKLDNNBN(inputs[0], param)) { + // MKLDNN batchnorm only works well on the special MKLDNN layout. + if (SupportMKLDNNBN(inputs[0], param) && inputs[0].IsMKLDNN()) { std::vector in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); @@ -461,8 +454,9 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, TShape shape = inputs[0].shape(); #if MXNET_USE_MKLDNN == 1 + // MKLDNN batchnorm only works well on the special MKLDNN layout. if (SupportMKLDNNBN(inputs[0], param) - && inputs[in_data_start].IsMKLDNN()) { + && (inputs[in_data_start].IsMKLDNN() || inputs[0].IsMKLDNN())) { std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); std::vector in_data(inputs.begin() + in_data_start, inputs.begin() + aux_states_start); diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index 6332f0a90f99..6ce1719f8aa2 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -119,8 +119,8 @@ void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, if (flags & use_scale_shift) { const NDArray &gamma = in_data[batchnorm::kGamma]; const NDArray &beta = in_data[batchnorm::kBeta]; - CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage); + CHECK(gamma.IsDefault()); + CHECK(beta.IsDefault()); // TODO(tao): how to reuse this memory? 
std::shared_ptr weight_mem( @@ -178,8 +178,8 @@ void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, } else { // training const NDArray &outMean = out_data[batchnorm::kMean]; const NDArray &outVar = out_data[batchnorm::kVar]; - CHECK_EQ(outMean.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(outVar.storage_type(), mxnet::kDefaultStorage); + CHECK(outMean.IsDefault()); + CHECK(outVar.IsDefault()); DType* omean = out_data[batchnorm::kMean].data().dptr(); DType* ovar = out_data[batchnorm::kVar].data().dptr(); @@ -232,16 +232,19 @@ void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam ¶m, const NDArray &out_mean = out_data[batchnorm::kMean]; const NDArray &out_var = out_data[batchnorm::kVar]; - CHECK_EQ(out_mean.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(out_var.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(moving_mean.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(moving_var.storage_type(), mxnet::kDefaultStorage); + CHECK(out_mean.IsDefault()); + CHECK(out_var.IsDefault()); + CHECK(moving_mean.IsDefault()); + CHECK(moving_var.IsDefault()); auto data_mem = data.GetMKLDNNData(); auto diff_mem = diff.GetMKLDNNData(); - if (diff_mem->get_primitive_desc() != data_mem->get_primitive_desc()) { + // MKLDNN batchnorm should run on special layouts. If one of them isn't, we + // should reorder them. + if (data.IsDefault()) data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc()); - } + else if (diff.IsDefault()) + diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_primitive_desc()); auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags); auto gradi_mem = const_cast(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc()); From f8ff37fd240e4e29336d2c8c3681a30e5f3466c4 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 20:16:38 +0000 Subject: [PATCH 248/264] Align to page when mkldnn is enabled. --- src/storage/cpu_device_storage.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index f0dd61f01ac0..fcea4ef74cdf 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -54,7 +54,11 @@ class CPUDeviceStorage { /*! * \brief Alignment of allocation. */ +#if MXNET_USE_MKLDNN == 1 + static constexpr size_t alignment_ = 4096; +#else static constexpr size_t alignment_ = 16; +#endif }; // class CPUDeviceStorage inline void* CPUDeviceStorage::Alloc(size_t size) { From ac546bd5a37f178905143f26ffafde42343a8371 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Mon, 1 Jan 2018 19:37:15 +0000 Subject: [PATCH 249/264] Use default mem alloc for mkldnn. --- include/mxnet/ndarray.h | 21 +++++++++--- src/ndarray/ndarray.cc | 76 ++++++++++++++++++++++------------------- 2 files changed, 57 insertions(+), 40 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index d0bd11ca24f1..5dc91b8674d3 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -487,8 +487,6 @@ class NDArray { CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; - // TODO we'll fix it later. - CHECK(!IsMKLDNN()); // We can't reuse memory in a view. 
CHECK(!IsView()); NDArray ret = *this; @@ -717,7 +715,7 @@ class NDArray { : static_data(false), delay_alloc(false) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); - shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);; + shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); shandle.ctx = ctx; shandle.shared_pid = shared_pid; shandle.shared_id = shared_id; @@ -792,6 +790,9 @@ class NDArray { inline void CheckAndAlloc(void) { if (delay_alloc) { shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx); +#if MXNET_USE_MKLDNN == 1 + Mkl_mem_ = nullptr; +#endif delay_alloc = false; } } @@ -800,15 +801,22 @@ class NDArray { // size is the number of bytes void CheckAndAlloc(uint64_t dbytes) { CHECK_EQ(kDefaultStorage, storage_type) - << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage"; + << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage"; + dbytes = std::max(dbytes, shandle.size); if (delay_alloc) { shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); +#if MXNET_USE_MKLDNN == 1 + Mkl_mem_ = nullptr; +#endif delay_alloc = false; } else if (shandle.size < dbytes) { // free storage if necessary and alloc again if (shandle.size > 0) Storage::Get()->Free(shandle); // init storage shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); +#if MXNET_USE_MKLDNN == 1 + Mkl_mem_ = nullptr; +#endif } } @@ -840,6 +848,11 @@ class NDArray { // Have MKL memory reference to the data in the default storage // or create memory for MKLDNN. void SetMKLMem(const TShape &shape, int dtype); + void ResetMKLMem() { + // If Mkl_mem_ isn't referencing to shandle, we need to reset Mkl_mem_. + if (Mkl_mem_ && Mkl_mem_->get_data_handle() != shandle.dptr) + Mkl_mem_ = nullptr; + } // In the data is stored in MKLDNN layout, we reorder data in Mkl_mem_ and // save the result in shandle. void Reorder2Default(); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 88aab4768d01..60ecec849073 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -115,6 +115,10 @@ NDArray::Chunk::~Chunk() { #endif Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { if (skip_free == false) { + if (mem.mem) { + CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size); + CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr); + } if (mem.h.size > 0) Storage::Get()->Free(mem.h); for (size_t i = 0; i < mem.aux_h.size(); i++) { if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]); @@ -132,6 +136,9 @@ void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { if (shandle.size > 0) Storage::Get()->Free(shandle); // init storage shandle = Storage::Get()->Alloc(dbytes, ctx); +#if MXNET_USE_MKLDNN == 1 + Mkl_mem_ = nullptr; +#endif } // init shape storage_shape = shape; @@ -190,7 +197,10 @@ NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const { // when it's destroyed. ret.ptr_->Mkl_mem_ = std::shared_ptr(def_mem, EmptyMKLDNNDeleter()); + ret.ptr_->shandle.dptr = def_mem->get_data_handle(); + ret.ptr_->shandle.size = def_mem->get_primitive_desc().get_size(); ret.ptr_->delay_alloc = false; + ret.ptr_->static_data = true; ret.byte_offset_ = byte_offset_; return ret; } @@ -327,13 +337,12 @@ static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::de } bool NDArray::Chunk::IsMKLDNN() const { - // When MKLDNN is enabled, data can be stored in two locations in Chunk: - // shandle or Mkl_mem_. When the data is stored in the default layout, - // the memory should be held by shandle, and Mkl_mem_ references to the - // memory. 
When the data is stored in special MKLDNN layout, the memory should - // be held by Mkl_mem_. TODO eventually, we want shandle to hold data for both - // cases. - return Mkl_mem_ != nullptr && Mkl_mem_->get_data_handle() != shandle.dptr; + if (storage_type != kDefaultStorage) + return false; + if (Mkl_mem_ == nullptr) + return false; + auto desc = Mkl_mem_->get_primitive_desc().desc(); + return desc.data.format != GetDefaultFormat(desc); } bool NDArray::Chunk::IsDefault() const { @@ -343,13 +352,8 @@ bool NDArray::Chunk::IsDefault() const { // format. if (Mkl_mem_ == nullptr) return true; - if (Mkl_mem_->get_data_handle() == shandle.dptr) { - auto desc = Mkl_mem_->get_primitive_desc().desc(); - CHECK(desc.data.format == GetDefaultFormat(desc)); - return true; - } else { - return false; - } + auto desc = Mkl_mem_->get_primitive_desc().desc(); + return desc.data.format == GetDefaultFormat(desc); } void NDArray::Chunk::Reorder2Default() { @@ -359,22 +363,20 @@ void NDArray::Chunk::Reorder2Default() { auto format = GetDefaultFormat(Mkl_mem_->get_primitive_desc().desc()); CHECK(format != Mkl_mem_->get_primitive_desc().desc().data.format); - CHECK(shandle.dptr == nullptr); - // CheckAndAlloc only allocate memroy if delay_alloc is true. - delay_alloc = true; - CheckAndAlloc(); auto def_pd = GetPrimitiveDesc(Mkl_mem_->get_primitive_desc(), format); - mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd, shandle.dptr)); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterPrim(mkldnn::reorder(*Mkl_mem_, *def_mem)); - stream->Submit(); - Mkl_mem_ = nullptr; + mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd)); + // This may be called in MKLDNN operators. We can't use MKLDNNStream here. + std::vector net; + net.push_back(mkldnn::reorder(*Mkl_mem_, *def_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + + CheckAndAlloc(def_pd.get_size()); + // TODO(zhengda) We need to avoid memory copy here. + memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size()); + Mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr)); } void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { - // In this case, data is stored in Mkl_mem_ - if (shandle.dptr == nullptr && Mkl_mem_ != nullptr) - return; // The shape of the array and the one of the MKL memory may mismatch. // For example, if the array stores parameters, the MKL memory may store data // in 5 dimensions while the NDArray stores data in 4 dimensions. @@ -415,8 +417,9 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { CHECK(delay_alloc); CheckAndAlloc(); } - Mkl_mem_.reset(new mkldnn::memory(mkldnn::memory::primitive_desc( - data_md, cpu_engine), shandle.dptr)); + mkldnn::memory::primitive_desc pd(data_md, cpu_engine); + CHECK(shandle.size >= pd.get_size()); + Mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr)); } /* @@ -563,12 +566,10 @@ void NDArray::Reorder(const mkldnn::memory::primitive_desc &pd) { net.push_back(mkldnn::reorder(*old_mem, *new_mem)); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - ptr_->Mkl_mem_ = new_mem; - // If the array stores data in the default layout, we should free the memory. - if (ptr_->shandle.dptr) { - Storage::Get()->Free(ptr_->shandle); - ptr_->shandle.dptr = nullptr; - } + ptr_->CheckAndAlloc(pd.get_size()); + // TODO(zhengda) We need to avoid memory copy here. 
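+  // The reorder above wrote into new_mem's private buffer; copying the
+  // result into shandle keeps shandle the single owner of the array's data.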
+ memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size()); + ptr_->Mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr)); } void NDArray::CopyFrom(const mkldnn::memory &mem) { @@ -681,13 +682,16 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc & return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } + if (ptr_->Mkl_mem_) + CHECK(ptr_->Mkl_mem_->get_data_handle() == ptr_->shandle.dptr); + ptr_->ResetMKLMem(); if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) { MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } - ptr_->Mkl_mem_.reset(new mkldnn::memory(desc)); - ptr_->delay_alloc = false; + ptr_->CheckAndAlloc(desc.get_size() + 4096); + ptr_->Mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr)); MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_.get(); } From fd33a69f17bed177d42c74ac33e43b4ea03563d3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 2 Jan 2018 01:29:39 +0000 Subject: [PATCH 250/264] Reuse NDArrays. --- src/executor/graph_executor.cc | 6 +----- src/imperative/imperative_utils.h | 5 +---- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 65a101dcbebe..0c9b25a05c28 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1220,11 +1220,7 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { if (storage_type == kDefaultStorage) { CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet"; const NDArray& src = data_pool_.at(storage_id); - // TODO this is a temp fix. - if (src.IsMKLDNN()) - data_entry_[i] = NDArray(vshape[i], data_context[i], true, vdtype[i]); - else - data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); + data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); } else { data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i], true, vdtype[i]); diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 218914c2891f..528cd06c4bee 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -739,14 +739,11 @@ inline void AllocateMemory(const nnvm::Graph& g, NDArray buff(TShape({static_cast(mem_plan[i].size)}), default_ctx, true, mshadow::kUint8); *arrays[i] = buff.AsArray(shapes[i], dtypes[i]); - } else if (!arrays[mem_plan[i].sid]->IsMKLDNN()) { - // TODO this is a temp fix. + } else { *arrays[i] = arrays[mem_plan[i].sid]->AsArray(shapes[i], dtypes[i]); if (mem_plan[i].inplace && array_reqs->at(i) == kWriteTo) { array_reqs->at(i) = kWriteInplace; } - } else { - *arrays[i] = NDArray(shapes[i], default_ctx, true, dtypes[i]); } } else { *arrays[i] = NDArray(static_cast(stypes[i]), From 22fd0e11ddd6ddb523805dc67dbf02598296bacb Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 2 Jan 2018 02:41:34 +0000 Subject: [PATCH 251/264] Support WriteInplace for sum. 
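
kWriteInplace means the output NDArray aliases one of the inputs, and MKLDNN
primitives may not support the case that a source and the destination use the
same memory. CreateMKLDNNMem therefore hands the primitive a temporary buffer
and CommitOutput copies the result back afterwards. The sum operator now
follows the same protocol as the other MKLDNN ops; the pattern (condensed from
the hunks below) is:

    auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
    MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second));
    CommitOutput(out_data, out_mem);  // copies the temporary back for kWriteInplace
    MKLDNNStream::Get()->Submit();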
--- src/operator/nn/mkldnn/mkldnn_base.cc | 8 ++++++++
 src/operator/nn/mkldnn/mkldnn_sum.cc   | 7 ++++---
 src/operator/tensor/elemwise_sum.cc    | 2 --
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 9cc59952daf8..11516ba29b8e 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -53,6 +53,11 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
   if (kAddTo == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
     return mkldnn_output_t(OutDataOp::AddBack, tmp);
+  } else if (kWriteInplace == req) {
+    // MKLDNN ops may not support the case that the input and the output use
+    // the same memory. Let's use an extra copy to make sure it always works.
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
   } else {
     mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc);
     if (mem == nullptr) {
@@ -70,6 +75,9 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
   if (kAddTo == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
     return mkldnn_output_t(OutDataOp::AddBack, tmp);
+  } else if (kWriteInplace == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
   } else {
     auto _desc = desc;
     auto def_format = GetDefaultFormat(_desc.desc());
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
index a012617a8bef..1efc285b808f 100644
--- a/src/operator/nn/mkldnn/mkldnn_sum.cc
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -51,6 +51,7 @@ void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
 void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                       const std::vector &inputs, const OpReqType &req,
                       const NDArray &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
   std::vector in_prims;
   std::vector in_pds(inputs.size());
   std::vector scales(inputs.size());
@@ -62,10 +63,10 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   }

   mkldnn::sum::primitive_desc pdesc(scales, in_pds);
-  auto output_memory = const_cast(out_data).CreateMKLDNNData(
-      pdesc.dst_primitive_desc());
+  auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
   MKLDNNStream *stream = MKLDNNStream::Get();
-  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *output_memory));
+  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second));
+  CommitOutput(out_data, out_mem);
   stream->Submit();
 }

diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index 9beab1cd0f25..926189fc77f7 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -121,8 +121,6 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
     mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd);
 #if MXNET_USE_MKLDNN == 1
   } else if (IsMKLDNN(inputs)) {
-    CHECK_EQ(req[0], kWriteTo)
-        << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
     MKLDNNSumForward(attrs, op_ctx, inputs, req[0], outputs[0]);
 #endif
   } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {

From 4eeffc97cfd2b0c194cdb8d6f6c21e47f2fcf302 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Tue, 2 Jan 2018 03:28:16 +0000
Subject: [PATCH 252/264] fix complaints from "make lint".
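
cpplint requires that standard-library headers be included directly where they
are used; the std::max call added to Chunk::CheckAndAlloc earlier in the
series is the likely trigger for the new include in ndarray.h (presumably
<algorithm>, which provides std::max).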
--- include/mxnet/ndarray.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 5dc91b8674d3..c6fb64a1688f 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -35,6 +35,7 @@ #include #include #include +#include #if MXNET_USE_MKLDNN == 1 #include #endif From f4b73db9e46968ce727293a3274408d52704f4ff Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 2 Jan 2018 19:00:54 +0000 Subject: [PATCH 253/264] Avoid reallocation in NDArray. --- src/ndarray/ndarray.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 60ecec849073..879f0e2d8d6e 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -370,6 +370,7 @@ void NDArray::Chunk::Reorder2Default() { net.push_back(mkldnn::reorder(*Mkl_mem_, *def_mem)); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + CHECK(shandle.size >= def_pd.get_size()); CheckAndAlloc(def_pd.get_size()); // TODO(zhengda) We need to avoid memory copy here. memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size()); @@ -566,6 +567,7 @@ void NDArray::Reorder(const mkldnn::memory::primitive_desc &pd) { net.push_back(mkldnn::reorder(*old_mem, *new_mem)); mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + CHECK(ptr_->shandle.size >= pd.get_size()); ptr_->CheckAndAlloc(pd.get_size()); // TODO(zhengda) We need to avoid memory copy here. memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size()); @@ -690,7 +692,8 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc & return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc); } - ptr_->CheckAndAlloc(desc.get_size() + 4096); + CHECK(ptr_->shandle.size >= desc.get_size()); + ptr_->CheckAndAlloc(desc.get_size()); ptr_->Mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr)); MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_); return ptr_->Mkl_mem_.get(); From ac8f9fdd2196108f985138ad1c1a22f53a9bb992 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 2 Jan 2018 21:01:03 +0000 Subject: [PATCH 254/264] Handle weight arrays with special MKLDNN layouts. --- src/operator/nn/mkldnn/mkldnn_base.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 11516ba29b8e..6e08a2502895 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -111,7 +111,12 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, int num_groups) { - const mkldnn::memory *mem; + const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd); + // If the weight array already uses the target layout, simply return it + // directly. + if (mem) + return mem; + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); auto engine = CpuEngine::Get()->get_engine(); if (arr.shape().ndim() == 2) { @@ -146,6 +151,8 @@ const mkldnn::memory *GetWeights(const NDArray &arr, LOG(FATAL) << "The weight array has an unsupported number of dimensions"; return nullptr; } + if (mem == nullptr) + mem = arr.GetMKLDNNDataReorder(target_pd); if (mem->get_primitive_desc() == target_pd) return mem; auto ret = TmpMemMgr::Get()->Alloc(target_pd); From 24200a07499c7b13f915eac3586905723ed9abf3 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 2 Jan 2018 21:03:15 +0000 Subject: [PATCH 255/264] Remove unnecessary GetWeights. 
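
The engine-based overload duplicated the layout-selection logic. After the
previous patch, the primitive-descriptor-based GetWeights already tries the
requested layout first and only reorders when necessary, so every caller can
share it. A condensed sketch of the surviving path:

    const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);  // exact layout hit?
    if (mem == nullptr)
      mem = arr.GetMKLDNNDataReorder(target_pd);               // convert otherwise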
--- src/operator/nn/mkldnn/mkldnn_base-inl.h | 3 -- src/operator/nn/mkldnn/mkldnn_base.cc | 38 ------------------------ 2 files changed, 41 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 4bf23a81f532..f53854d3f371 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -389,9 +389,6 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); const mkldnn::memory *GetWeights(const NDArray &arr, const mkldnn::memory::primitive_desc &target_pd, int num_groups); -const mkldnn::memory *GetWeights(const NDArray &arr, - const mkldnn::engine &engine, - int num_groups = 1); mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc); mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 6e08a2502895..7b5d9cb6706a 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -160,44 +160,6 @@ const mkldnn::memory *GetWeights(const NDArray &arr, return ret; } -const mkldnn::memory *GetWeights(const NDArray &arr, - const mkldnn::engine &engine, - int num_groups) { - mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); - if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - return arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - return arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, - static_cast(arr.shape()[0] / num_groups), - static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), - static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - return arr.GetMKLDNNData(pd); - } else { - LOG(FATAL) << "The weight array has an unsupported number of dimensions"; - return nullptr; - } -} - mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { if (desc.data.ndims == 1) { return desc.data.format; From 19d87498b7ac9120d56d4d022c79ae54f998d7cf Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 2 Jan 2018 21:23:33 +0000 Subject: [PATCH 256/264] Fix compilation error without MKLDNN. 
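
The Chunk destructor's checks on mem.mem and the IsMKLDNN() helper in
elemwise_sum.cc both touch MKLDNN-only code, so they have to sit behind the
build flag, following the pattern used throughout the series:

    #if MXNET_USE_MKLDNN == 1
      // anything that references mkldnn:: types or NDArray::IsMKLDNN()
    #endif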
--- src/ndarray/ndarray.cc | 2 ++ src/operator/tensor/elemwise_sum.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 879f0e2d8d6e..b28998fb9662 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -115,10 +115,12 @@ NDArray::Chunk::~Chunk() { #endif Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { if (skip_free == false) { +#if MXNET_USE_MKLDNN == 1 if (mem.mem) { CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size); CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr); } +#endif if (mem.h.size > 0) Storage::Get()->Free(mem.h); for (size_t i = 0; i < mem.aux_h.size(); i++) { if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]); diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 926189fc77f7..59a284aea6e1 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -94,6 +94,7 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, return ret; } +#if MXNET_USE_MKLDNN == 1 static inline bool IsMKLDNN(const std::vector &arrs) { for (auto &arr : arrs) { if (!arr.IsMKLDNN()) @@ -101,6 +102,7 @@ static inline bool IsMKLDNN(const std::vector &arrs) { } return true; } +#endif void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& op_ctx, From 18236fc33948ce68572ee380bb9c5af5e2741973 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 3 Jan 2018 23:28:58 +0000 Subject: [PATCH 257/264] Fix a bug in (de)conv for weight arrays. --- src/operator/nn/mkldnn/mkldnn_convolution.cc | 4 ++++ src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 3aed59aab4ab..8ba0cd1c5e6a 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -269,6 +269,10 @@ void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc()); const mkldnn::memory *weight_mem; if (ctx.is_train) { + // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it + // to the default format for now. + if (in_data[conv::kWeight].IsMKLDNN()) + const_cast(in_data[conv::kWeight]).Reorder2Default(); weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), param.num_group); } else { diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 12233b6cbce1..dc22437e68df 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -165,6 +165,10 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &c deconvFwd_pd.diff_dst_primitive_desc()); const mkldnn::memory *weight_mem; if (ctx.is_train) { + // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it + // to the default format for now. + if (in_data[deconv::kWeight].IsMKLDNN()) + const_cast(in_data[deconv::kWeight]).Reorder2Default(); weight_mem = GetWeights(in_data[deconv::kWeight], deconvFwd_pd.weights_primitive_desc(), param.num_group); From 1cd8bad98d7a18359d418f940ed3f528d748ce90 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Wed, 3 Jan 2018 23:29:34 +0000 Subject: [PATCH 258/264] Fix a minor bug in MKLDNN conv. 
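
CommitOutput for the bias gradient used to run unconditionally, even when
in_grad_bias had never been produced (e.g. for a convolution built with
no_bias); it is now committed inside the branch that actually computes it.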
--- src/operator/nn/mkldnn/mkldnn_convolution.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 8ba0cd1c5e6a..f10ff0f674a2 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -344,9 +344,9 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, *in_grad_bias.second)); + CommitOutput(in_grad[conv::kBias], in_grad_bias); } CommitOutput(in_grad[conv::kWeight], in_grad_weight); - CommitOutput(in_grad[conv::kBias], in_grad_bias); } MKLDNNStream::Get()->Submit(); } From 9b3c8b2ba940e8657dd61451f9bc072c44ae0de2 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Thu, 4 Jan 2018 02:09:19 +0000 Subject: [PATCH 259/264] Avoid caching TBlob from NDArray. This commit may add some overhead of managing NDArray for each fallback. --- include/mxnet/ndarray.h | 6 +++ src/common/exec_utils.h | 62 +++++++++++++++++++++++----- src/executor/attach_op_execs_pass.cc | 31 ++++++++++---- src/imperative/imperative_utils.h | 12 +++--- 4 files changed, 85 insertions(+), 26 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index c6fb64a1688f..4f6d295230e4 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -608,6 +608,12 @@ class NDArray { ptr_->Reorder2Default(); } + void InvalidateData() { + CHECK_EQ(storage_type(), kDefaultStorage); + // When we invalidate data, we don't need to care about the MKLDNN format. + ptr_->Mkl_mem_ = nullptr; + } + /* * This function is used inside operators to reshape an array. * It's used by FullyConnected right now. diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index a61c2baf9728..2867a7ab47d8 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -43,11 +43,12 @@ namespace common { indices are not recorded * \return true if any source NDArray need to cast storage */ -inline bool SetupDefaultBlobs(const std::vector& src, - std::vector *blobs, - std::vector *temp_src, - std::vector *temp_dst, - std::unordered_map *idx_map = nullptr) { +inline bool SetupDefaultBlobsIn(const std::vector& src, + const std::vector *bufs, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst, + std::unordered_map *idx_map) { bool require_cast = false; for (size_t i = 0; i < src.size(); i++) { auto& nd = src[i]; @@ -57,10 +58,44 @@ inline bool SetupDefaultBlobs(const std::vector& src, is_default = nd.IsDefault(); #endif if (!is_default) { - if (idx_map != nullptr) { - (*idx_map)[i] = temp_dst->size(); - } - NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype()); + (*idx_map)[i] = temp_dst->size(); + NDArray temp = bufs != nullptr ? 
bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); +#if MXNET_USE_MKLDNN == 1 + CHECK(temp.IsDefault()); +#endif + temp_src->emplace_back(nd); + temp_dst->emplace_back(temp); + blobs->emplace_back(temp.data()); + require_cast = true; + } else { + blobs->push_back(nd.data()); + } + } + return require_cast; +} + +inline bool SetupDefaultBlobsOut(const std::vector& src, + const std::vector &req, + const std::vector *bufs, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst) { + bool require_cast = false; + for (size_t i = 0; i < src.size(); i++) { + auto& nd = src[i]; + bool is_default = nd.storage_type() == kDefaultStorage; +#if MXNET_USE_MKLDNN == 1 + // If it's writeTo, we don't need to worry whether it contains valid data. + if (req[i] == kWriteTo) + const_cast(nd).InvalidateData(); + // We have to make sure it's default storage and default layout. + is_default = nd.IsDefault(); +#endif + if (!is_default) { + NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype()); +#if MXNET_USE_MKLDNN == 1 + CHECK(temp.IsDefault()); +#endif temp_src->emplace_back(nd); temp_dst->emplace_back(temp); blobs->emplace_back(temp.data()); @@ -81,6 +116,9 @@ inline bool SetupDefaultBlobs(const std::vector& src, */ inline void SetupDefaultBlobsInOut(const std::vector &ndinputs, const std::vector &ndoutputs, + const std::vector &req, + const std::vector *in_bufs, + const std::vector *out_bufs, std::vector *input_blobs, std::vector *output_blobs, std::vector *pre_temp_src, @@ -90,9 +128,11 @@ inline void SetupDefaultBlobsInOut(const std::vector &ndinputs, std::unordered_map *in_temp_idx_map, const std::vector &mutate_idx) { // populate input blobs - SetupDefaultBlobs(ndinputs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map); + SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst, + in_temp_idx_map); // populate output blobs - SetupDefaultBlobs(ndoutputs, output_blobs, post_temp_dst, post_temp_src); + SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst, + post_temp_src); // add mutable inputs to post temp list for (const auto idx : mutate_idx) { auto map_iter = in_temp_idx_map->find(idx); diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 6bcfd6fcf614..f6534fc82398 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -54,23 +54,34 @@ class StorageFallbackOpExecutor : public OpExecutor { protected: // initialize the data blobs void InitBlobs() { - using namespace common; if (!init_) { - in_data_.clear(); out_data_.clear(); - pre_temp_src_.clear(); pre_temp_dst_.clear(); - post_temp_src_.clear(); post_temp_dst_.clear(); - in_temp_idx_map_.clear(); - SetupDefaultBlobsInOut(in_array, out_array, &in_data_, &out_data_, - &pre_temp_src_, &pre_temp_dst_, - &post_temp_src_, &post_temp_dst_, - &in_temp_idx_map_, mutate_idx_); + pre_temp_buf_.clear(); + post_temp_buf_.clear(); + for (size_t i = 0; i < in_array.size(); i++) { + auto &nd = in_array[i]; + pre_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype()); + } + for (size_t i = 0; i < out_array.size(); i++) { + auto &nd = out_array[i]; + post_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype()); + } init_ = true; } } // storage fallback before fcompute is launched void PreFCompute(bool is_gpu) { + using namespace common; InitBlobs(); + in_data_.clear(); out_data_.clear(); + pre_temp_src_.clear(); pre_temp_dst_.clear(); + 
post_temp_src_.clear(); post_temp_dst_.clear(); + in_temp_idx_map_.clear(); + SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_, + &in_data_, &out_data_, + &pre_temp_src_, &pre_temp_dst_, + &post_temp_src_, &post_temp_dst_, + &in_temp_idx_map_, mutate_idx_); common::CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx, is_gpu); } @@ -81,6 +92,8 @@ class StorageFallbackOpExecutor : public OpExecutor { // default storage tensor blobs for fcompute std::vector in_data_, out_data_; + // These are NDArray buffers for cast storage. + std::vector pre_temp_buf_, post_temp_buf_; // source NDArray for cast storage std::vector pre_temp_src_, post_temp_src_; // destination NDArray for cast storage diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index ceb5ed5be892..4e3624b81010 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -353,9 +353,9 @@ inline void PushFCompute(const FCompute& fn, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; // setup blobs - SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs, - &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst, - &in_temp_idx_map, mutate_idx); + SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr, + &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, + &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup context OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; bool is_gpu = ctx.dev_mask() == gpu::kDevMask; @@ -467,9 +467,9 @@ inline void PushOperator(const OpStatePtr& state, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; // populate input blobs and output blobs - SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs, - &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst, - &in_temp_idx_map, mutate_idx); + SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr, + &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, + &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup contexts bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask; // pre-fcompute fallback From c426bfa37cca60d81434a170388cb17f13fc5550 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Sat, 6 Jan 2018 01:58:33 +0000 Subject: [PATCH 260/264] Fix a bug in MKLDNNOpSignature. --- src/operator/nn/mkldnn/mkldnn_base-inl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index f53854d3f371..b34a9bcd32a3 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -281,6 +281,10 @@ class MKLDNNOpSignature { uint64_t hash; public: + MKLDNNOpSignature() { + hash = 0; + } + /* * We provide different methods to add signature to an op. * For operations, such as convolutin and fully connected, which determines From 5825191fd80d011c713e6f41674a25399e042f82 Mon Sep 17 00:00:00 2001 From: Lv Tao Date: Mon, 8 Jan 2018 16:55:55 +0800 Subject: [PATCH 261/264] 1. Fix coding style in BatchNorm; 2. Add memory into signature; 3. Try to split BatchNorm into .h file and .cc file. Will finish it after backward code is refactored. 
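
For (2), the forward-primitive cache key now hashes the input's MKLDNN memory
(and therefore its layout) instead of the NDArray itself, so a cached
primitive is not reused when the same array arrives with a different layout:

    MKLDNNBNSignature key(param);
    key.AddSign(is_train);
    key.AddSign(*(data.GetMKLDNNData()));  // was: key.AddSign(data)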
--- src/operator/nn/batch_norm.cc | 47 ++-- .../nn/mkldnn/mkldnn_batch_norm-inl.h | 203 +++--------------- src/operator/nn/mkldnn/mkldnn_batch_norm.cc | 176 +++++++++++++++ 3 files changed, 221 insertions(+), 205 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_batch_norm.cc diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 55b1969dacbb..96c1444857f2 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -382,23 +382,6 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs, return true; } -static inline bool similar_array(const mxnet::NDArray &arr1, - const mxnet::NDArray &arr2, - float tol) { - float *data1 = reinterpret_cast(arr1.data().dptr_); - float *data2 = reinterpret_cast(arr2.data().dptr_); - if (arr1.shape().Size() != arr2.shape().Size()) - return false; - for (size_t i = 0; i < arr1.shape().Size(); i++) { - if (std::abs(data1[i] - data2[i]) > tol) { - // printf("similar_array: %.8f, %.8f \n", data1[i], data2[i]); - return false; - } - } - std::cout << "similar_array: passed all check, tol=" << tol << std::endl; - return true; -} - #if MXNET_USE_MKLDNN == 1 static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { TShape shape = input.shape(); @@ -408,11 +391,11 @@ static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &p } #endif -void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +void BatchNormComputeCPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { CHECK_EQ(inputs.size(), 5U); #if MXNET_USE_MKLDNN == 1 const BatchNormParam ¶m = nnvm::get(attrs.parsed); @@ -423,7 +406,7 @@ void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, switch (inputs[0].dtype()) { case mshadow::kFloat32: - MKLDNNBatchNormForward(ctx, param, in_data, req, outputs, aux_states); + MKLDNNBatchNormCompute(ctx, param, in_data, req, outputs, aux_states); return; } } @@ -440,11 +423,11 @@ void BatchNormCompute_CPU(const nnvm::NodeAttrs &attrs, BatchNormCompute(attrs, ctx, in_blobs, req, out_blobs); } -void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +void BatchNormGradComputeCPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { CHECK_EQ(inputs.size(), 11U); const BatchNormParam ¶m = nnvm::get(attrs.parsed); int num_out_grads = param.output_mean_var ? 3U : 1U; @@ -466,8 +449,8 @@ void BatchNormGradCompute_CPU(const nnvm::NodeAttrs &attrs, std::vector in_grad(outputs.begin(), outputs.begin() + 3); if (inputs[0].dtype() == mshadow::kFloat32) { - MKLDNNBatchNormBackward(ctx, param, out_grad, in_data, - out_data, req, in_grad, aux_states); + MKLDNNBatchNormGradCompute(ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); return; } } @@ -598,7 +581,7 @@ then set ``gamma`` to 1 and its gradient to 0. 
.set_attr("FInferType", BatchNormType) .set_attr("FInferStorageType", BatchNormStorageType) .set_attr("FCompute", BatchNormCompute) -.set_attr("FComputeEx", BatchNormCompute_CPU) +.set_attr("FComputeEx", BatchNormComputeCPU) .set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { @@ -633,7 +616,7 @@ NNVM_REGISTER_OP(_backward_BatchNorm) #endif .set_attr_parser(ParamParser) .set_attr("FCompute", BatchNormGradCompute) -.set_attr("FComputeEx", BatchNormGradCompute_CPU); +.set_attr("FComputeEx", BatchNormGradComputeCPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h index e18d2c98fe5c..9f5dc5bc65fd 100644 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -98,20 +98,20 @@ inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, } template -class MKLDNNBNForward { +class MKLDNNBatchNormFwd { public: - MKLDNNBNForward(const mxnet::NDArray &data, DType eps, - bool is_train, bool scale_shift, - bool global_stats, bool fix_gamma) : - _out_mean(nullptr), _out_var(nullptr), - _flag(0U), _fix_gamma(fix_gamma), _is_train(is_train), - _channels(data.shape()[1]), _eps(eps), - fwd(nullptr), data(nullptr), weight(nullptr), - out(nullptr), mean(nullptr), variance(nullptr) { + MKLDNNBatchNormFwd(const mxnet::NDArray &data, DType eps, + bool is_train, bool scale_shift, + bool global_stats, bool fix_gamma) : + _out_mean(nullptr), _out_var(nullptr), + _flag(0U), _fix_gamma(fix_gamma), _is_train(is_train), + _channels(data.shape()[1]), _eps(eps), + fwd(nullptr), data(nullptr), weight(nullptr), + out(nullptr), mean(nullptr), variance(nullptr) { _Init(data, scale_shift, global_stats); } - ~MKLDNNBNForward() {} + ~MKLDNNBatchNormFwd() {} void SetDataHandle(const std::vector &req, const mxnet::NDArray &data, @@ -153,165 +153,22 @@ class MKLDNNBNForward { }; template -void MKLDNNBNForward::_Init(const mxnet::NDArray &src, bool scale_shift, bool global_stats) { - this->_flag |= scale_shift ? use_scale_shift : 0U; - this->_flag |= global_stats ? 
use_global_stats : 0U; - - auto src_md = src.GetMKLDNNData()->get_primitive_desc().desc(); - auto engine = CpuEngine::Get()->get_engine(); - - mkldnn::prop_kind prop = forward_training; - if (this->_is_train) { - prop = forward_training; - } else { - prop = forward_inference; - } - - auto fwd_desc = t_bn_f_desc(prop, src_md, this->_eps, this->_flag); - auto fwd_pd = t_bn_f_pdesc(fwd_desc, engine); - - this->data.reset(new mkldnn::memory(src.GetMKLDNNData()->get_primitive_desc())); - this->out.reset(new mkldnn::memory(fwd_pd.dst_primitive_desc())); - - if (this->_flag & use_scale_shift) { - this->weight.reset(new memory(fwd_pd.weights_primitive_desc())); - } - - if (this->_is_train || (this->_flag & use_global_stats)) { - this->mean.reset(new mkldnn::memory(fwd_pd.mean_primitive_desc())); - this->variance.reset(new mkldnn::memory(fwd_pd.variance_primitive_desc())); - } - - // for mxnet, there always has weight - CHECK_EQ(this->_flag & use_scale_shift, use_scale_shift); - if (!(this->_is_train)) { - this->fwd.reset( - new mkldnn::batch_normalization_forward(fwd_pd, - *(this->data), - mkldnn::primitive::at(*(this->mean)), - mkldnn::primitive::at(*(this->variance)), - mkldnn::primitive::at(*(this->weight)), - *(this->out))); - } else { - this->fwd.reset( - new mkldnn::batch_normalization_forward(fwd_pd, - *(this->data), - mkldnn::primitive::at(*(this->weight)), - *(this->out), - *(this->mean), - *(this->variance))); - } - return; -} - -template -void MKLDNNBNForward::SetDataHandle(const std::vector &req, - const mxnet::NDArray &data, - const mxnet::NDArray &output, - const mxnet::TBlob &moving_mean, - const mxnet::TBlob &moving_var, - const mxnet::TBlob &out_mean, - const mxnet::TBlob &out_var, - const mxnet::TBlob *gamma, - const mxnet::TBlob *beta) { - auto data_mem = data.GetMKLDNNData(); - auto out_mem = const_cast(output).CreateMKLDNNData(this->out->get_primitive_desc()); - this->data->set_data_handle(data_mem->get_data_handle()); - this->out->set_data_handle(out_mem->get_data_handle()); - - // weights - if (gamma != nullptr && beta != nullptr && (this->_flag | use_scale_shift)) { - _SetWeight(*gamma, *beta, req[batchnorm::kGamma]); - } - - // mean and variance - this->_out_mean = out_mean.dptr(); - this->_out_var = out_var.dptr(); - if (!(this->_is_train)) { - this->mean->set_data_handle(moving_mean.dptr()); - this->variance->set_data_handle(moving_var.dptr()); - } else { - this->mean->set_data_handle(this->_out_mean); - this->variance->set_data_handle(this->_out_var); - } -} - -template -void MKLDNNBNForward::Execute() { - if (!(this->_is_train)) { - MKLDNNStream::Get()->RegisterPrim(*(this->fwd)); - MKLDNNStream::Get()->Submit(); - _SetMeanVar(reinterpret_cast(this->mean->get_data_handle()), - reinterpret_cast(this->variance->get_data_handle()), - this->_out_mean, this->_out_var); - } else { - MKLDNNStream::Get()->RegisterPrim(*(this->fwd)); - MKLDNNStream::Get()->Submit(); - _SetMeanVar(reinterpret_cast(this->mean->get_data_handle()), - reinterpret_cast(this->variance->get_data_handle()), - this->_out_mean, this->_out_var); - } -} - -template -void MKLDNNBNForward::_SetWeight(const mxnet::TBlob &gamma, - const mxnet::TBlob &beta, - const OpReqType &req) { - // CHECK_NE(this->weight, nullptr); - DType *gamma_ptr = gamma.dptr(); - DType *beta_ptr = beta.dptr(); - DType *weight_ptr = reinterpret_cast(this->weight->get_data_handle()); - - if (!(this->_fix_gamma)) { -#pragma omp parallel for simd - for (int i = 0; i < this->_channels; i++) { - weight_ptr[i] = gamma_ptr[i]; - 
weight_ptr[this->_channels + i] = beta_ptr[i]; // bias - } - } else if (IsBNWriting(req)) { -#pragma omp parallel for simd - for (int i = 0; i < this->_channels; i++) { - weight_ptr[i] = (DType)1.0f; - weight_ptr[this->_channels + i] = beta_ptr[i]; // bias - gamma_ptr[i] = (DType)1.0f; - } - } else { -#pragma omp parallel for simd - for (int i = 0; i < this->_channels; i++) { - weight_ptr[i] = (DType)1.0f; - weight_ptr[this->_channels + i] = beta_ptr[i]; // bias - } - } -} - -template -void MKLDNNBNForward::_SetMeanVar(const DType *imean, - const DType *ivar, - DType *omean, - DType *ovar) { -#pragma omp parallel for simd - for (int i = 0; i < this->_channels; i++) { - omean[i] = imean[i]; - ovar[i] = VARIANCE_TO_INVSTD(ivar[i], this->_eps); - } -} - -template -static inline MKLDNNBNForward &GetBNFwd(const BatchNormParam ¶m, - bool is_train, - const NDArray &data) { +static inline MKLDNNBatchNormFwd &GetBatchNormFwd(const BatchNormParam ¶m, + bool is_train, + const NDArray &data) { static thread_local std::unordered_map, - MKLDNNOpHash> fwds; + MKLDNNBatchNormFwd, + MKLDNNOpHash> bn_fwds; MKLDNNBNSignature key(param); key.AddSign(is_train); - key.AddSign(data); + key.AddSign(*(data.GetMKLDNNData())); - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNBNForward fwd(data, param.eps, is_train, true, + auto it = bn_fwds.find(key); + if (it == bn_fwds.end()) { + MKLDNNBatchNormFwd fwd(data, param.eps, is_train, true, param.use_global_stats, param.fix_gamma); - auto ins_ret = fwds.insert(std::pair >(key, fwd)); + auto ins_ret = bn_fwds.insert( + std::pair >(key, fwd)); CHECK(ins_ret.second); it = ins_ret.first; } @@ -319,7 +176,7 @@ static inline MKLDNNBNForward &GetBNFwd(const BatchNormParam ¶m, } template -void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, +void MKLDNNBatchNormCompute(const OpContext &ctx, const BatchNormParam ¶m, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -334,20 +191,20 @@ void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, auto out_mean = out_data[batchnorm::kMean].data(); auto out_var = out_data[batchnorm::kVar].data(); - MKLDNNBNForward &fwd = GetBNFwd(param, ctx.is_train, data); + MKLDNNBatchNormFwd &fwd = GetBatchNormFwd(param, ctx.is_train, data); fwd.SetDataHandle(req, data, out, moving_mean, moving_var, out_mean, out_var, &gamma, &beta); fwd.Execute(); } template -void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam ¶m, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void MKLDNNBatchNormGradCompute(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U); CHECK_EQ(in_data.size(), 3U); diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm.cc b/src/operator/nn/mkldnn/mkldnn_batch_norm.cc new file mode 100644 index 000000000000..b40a9f3b290e --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm.cc @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_batch_norm.cc
+ * \brief MKLDNN batch normalization forward implementation
+ * \author Tao Lv (tao.a.lv@intel.com)
+*/
+
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn_batch_norm-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::_Init(const mxnet::NDArray &src,
+                                      bool scale_shift,
+                                      bool global_stats) {
+  this->_flag |= scale_shift ? use_scale_shift : 0U;
+  // Inference always consumes the moving (global) statistics.
+  if (!(this->_is_train))
+    this->_flag |= use_global_stats;
+
+  auto src_md = src.GetMKLDNNData()->get_primitive_desc().desc();
+  auto engine = CpuEngine::Get()->get_engine();
+
+  mkldnn::prop_kind prop = this->_is_train ? forward_training
+                                           : forward_inference;
+
+  auto fwd_desc = t_bn_f_desc(prop, src_md, this->_eps, this->_flag);
+  auto fwd_pd = t_bn_f_pdesc(fwd_desc, engine);
+
+  this->data.reset(new mkldnn::memory(src.GetMKLDNNData()->get_primitive_desc()));
+  this->out.reset(new mkldnn::memory(fwd_pd.dst_primitive_desc()));
+
+  if (this->_flag & use_scale_shift) {
+    this->weight.reset(new mkldnn::memory(fwd_pd.weights_primitive_desc()));
+  }
+
+  if (this->_is_train || (this->_flag & use_global_stats)) {
+    this->mean.reset(new mkldnn::memory(fwd_pd.mean_primitive_desc()));
+    this->variance.reset(new mkldnn::memory(fwd_pd.variance_primitive_desc()));
+  }
+
+  // MXNet always supplies gamma/beta, so scale_shift must be enabled here.
+  CHECK_EQ(this->_flag & use_scale_shift, use_scale_shift);
+  if (!(this->_is_train)) {
+    // Inference: mean and variance are inputs to the primitive.
+    this->fwd.reset(
+        new mkldnn::batch_normalization_forward(fwd_pd,
+                                                *(this->data),
+                                                mkldnn::primitive::at(*(this->mean)),
+                                                mkldnn::primitive::at(*(this->variance)),
+                                                mkldnn::primitive::at(*(this->weight)),
+                                                *(this->out)));
+  } else {
+    // Training: the primitive computes mean and variance as outputs.
+    this->fwd.reset(
+        new mkldnn::batch_normalization_forward(fwd_pd,
+                                                *(this->data),
+                                                mkldnn::primitive::at(*(this->weight)),
+                                                *(this->out),
+                                                *(this->mean),
+                                                *(this->variance)));
+  }
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::SetDataHandle(const std::vector<OpReqType> &req,
+                                              const mxnet::NDArray &data,
+                                              const mxnet::NDArray &output,
+                                              const mxnet::TBlob &moving_mean,
+                                              const mxnet::TBlob &moving_var,
+                                              const mxnet::TBlob &out_mean,
+                                              const mxnet::TBlob &out_var,
+                                              const mxnet::TBlob *gamma,
+                                              const mxnet::TBlob *beta) {
+  auto data_mem = data.GetMKLDNNData();
+  auto out_mem = const_cast<NDArray &>(output).CreateMKLDNNData(this->out->get_primitive_desc());
+  this->data->set_data_handle(data_mem->get_data_handle());
+  this->out->set_data_handle(out_mem->get_data_handle());
+
+  // Copy gamma/beta into the packed MKLDNN weight buffer when scale/shift
+  // is enabled ('&' tests the flag bit).
+  if (gamma != nullptr && beta != nullptr && (this->_flag & use_scale_shift)) {
+    _SetWeight(*gamma, *beta, req[batchnorm::kGamma]);
+  }
+
+  // mean and variance
+  this->_out_mean = out_mean.dptr<DType>();
+  this->_out_var = out_var.dptr<DType>();
+  if (!(this->_is_train)) {
+    // Inference reads the moving statistics.
+    this->mean->set_data_handle(moving_mean.dptr<DType>());
+    this->variance->set_data_handle(moving_var.dptr<DType>());
+  } else {
+    // Training writes the batch statistics straight into the output blobs.
+    this->mean->set_data_handle(this->_out_mean);
+    this->variance->set_data_handle(this->_out_var);
+  }
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::Execute() {
+  MKLDNNStream::Get()->RegisterPrim(*(this->fwd));
+  MKLDNNStream::Get()->Submit();
+  _SetMeanVar(reinterpret_cast<DType *>(this->mean->get_data_handle()),
+              reinterpret_cast<DType *>(this->variance->get_data_handle()),
+              this->_out_mean, this->_out_var);
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::_SetWeight(const mxnet::TBlob &gamma,
+                                           const mxnet::TBlob &beta,
+                                           const OpReqType &req) {
+  DType *gamma_ptr = gamma.dptr<DType>();
+  DType *beta_ptr = beta.dptr<DType>();
+  // MKLDNN packs scale and shift into one buffer: gamma in [0, C), beta in [C, 2C).
+  DType *weight_ptr = reinterpret_cast<DType *>(this->weight->get_data_handle());
+  if (!(this->_fix_gamma)) {
+#pragma omp parallel for simd
+    for (int i = 0; i < this->_channels; i++) {
+      weight_ptr[i] = gamma_ptr[i];
+      weight_ptr[this->_channels + i] = beta_ptr[i];  // bias
+    }
+  } else if (IsBNWriting(req)) {
+#pragma omp parallel for simd
+    for (int i = 0; i < this->_channels; i++) {
+      weight_ptr[i] = (DType)1.0f;
+      weight_ptr[this->_channels + i] = beta_ptr[i];  // bias
+      gamma_ptr[i] = (DType)1.0f;
+    }
+  } else {
+#pragma omp parallel for simd
+    for (int i = 0; i < this->_channels; i++) {
+      weight_ptr[i] = (DType)1.0f;
+      weight_ptr[this->_channels + i] = beta_ptr[i];  // bias
+    }
+  }
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::_SetMeanVar(const DType *imean,
+                                            const DType *ivar,
+                                            DType *omean,
+                                            DType *ovar) {
+  float e = this->_eps;
+#pragma omp parallel for firstprivate(e)
+  for (int i = 0; i < this->_channels; i++) {
+    omean[i] = imean[i];
+    // MXNet stores the inverse standard deviation, not the raw variance.
+    ovar[i] = VARIANCE_TO_INVSTD(ivar[i], e);
+  }
+}
+
+template class MKLDNNBatchNormFwd<float>;
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN

From 7c957ef6265aa159fe64d589dbf9d2630f43eaef Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Tue, 9 Jan 2018 20:51:00 -0800
Subject: [PATCH 262/264] Update mkldnn_base-inl.h

---
 src/operator/nn/mkldnn/mkldnn_base-inl.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index b34a9bcd32a3..846f9596eea6 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -380,6 +380,8 @@ typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
  * The difference is that the first function can create MKLDNN memory with
  * special layouts in an NDArray, while the second one can only create MKLDNN
  * memory with default layouts.
+ * If these two functions are used, we have to call CommitOutput to write
+ * the output back to the output NDArray.
  */
 mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
                                 const mkldnn::memory::primitive_desc &desc,
                                 OpReqType req);
 mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
                                        const mkldnn::memory::primitive_desc &desc,
                                        OpReqType req);
-
+/* This function has to be used with one of the functions above.
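+ * A minimal usage sketch (the variable names are placeholders; the exact
+ * primitive and descriptors depend on the operator):
+ *
+ *   mkldnn_output_t out_mem = CreateMKLDNNMem(out_arr, fwd_pd.dst_primitive_desc(), req);
+ *   MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(fwd_pd, *in_mem, *weight_mem, *out_mem.second));
+ *   CommitOutput(out_arr, out_mem);
+ *   MKLDNNStream::Get()->Submit();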
*/ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); const mkldnn::memory *GetWeights(const NDArray &arr, From ec9a2c758718c5dd3bcb199b23a4914e21c68c72 Mon Sep 17 00:00:00 2001 From: Jin Huang Date: Thu, 11 Jan 2018 20:45:10 +0800 Subject: [PATCH 263/264] Implement primitive cache for FullyConnected OP and class refact --- src/operator/nn/fully_connected-inl.h | 5 + src/operator/nn/fully_connected.cc | 24 +- .../nn/mkldnn/mkldnn_fully_connected.cc | 214 ++++++++++++++---- src/operator/nn/mkldnn/mkldnn_ops-inl.h | 6 +- 4 files changed, 199 insertions(+), 50 deletions(-) diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index e8e95643e647..e7d1a25b79ab 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -60,6 +60,11 @@ struct FullyConnectedParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(flatten).set_default(true) .describe("Whether to collapse all but the first axis of the input data tensor."); } + bool operator==(const FullyConnectedParam& other) const { + return this->num_hidden == other.num_hidden && + this->no_bias == other.no_bias && + this->flatten == other.flatten; + } }; template diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 31098413929b..abca564c4319 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -73,12 +73,14 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, return true; } -void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const std::vector &req, - const std::vector &outputs) { +void FullyConnectedComputeCPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNFCForward(attrs, ctx, inputs, req, outputs); + MKLDNNFullyConnectedCompute(attrs, ctx, inputs, req, outputs); return; } #endif @@ -91,12 +93,14 @@ void FullyConnectedCompute_CPU(const nnvm::NodeAttrs& attrs, const OpContext &ct FullyConnectedCompute(attrs, ctx, in_blobs, req, out_blobs); } -void FullyConnectedGradCompute_CPU(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, const std::vector &inputs, - const std::vector &req, const std::vector &outputs) { +void FullyConnectedGradComputeCPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { #if MXNET_USE_MKLDNN == 1 if (SupportMKLDNN(inputs[0])) { - MKLDNNFCBackward(attrs, ctx, inputs, req, outputs); + MKLDNNFullyConnectedGradCompute(attrs, ctx, inputs, req, outputs); return; } #endif @@ -215,7 +219,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored. 
.set_attr("FInferShape", FullyConnectedShape) .set_attr("FInferType", FullyConnectedType) .set_attr("FCompute", FullyConnectedCompute) -.set_attr("FComputeEx", FullyConnectedCompute_CPU) +.set_attr("FComputeEx", FullyConnectedComputeCPU) .set_attr("FGradient", FullyConnectedGrad{"_backward_FullyConnected"}) .add_argument("data", "NDArray-or-Symbol", "Input data.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") @@ -240,7 +244,7 @@ NNVM_REGISTER_OP(_backward_FullyConnected) .set_attr("FInferStorageType", BackwardFCStorageType) .set_attr_parser(ParamParser) .set_attr("FCompute", FullyConnectedGradCompute) -.set_attr("FComputeEx", FullyConnectedGradCompute_CPU); +.set_attr("FComputeEx", FullyConnectedGradComputeCPU); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 451b94060a41..42477fe89c2e 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -80,54 +80,192 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei } } -void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - const TShape& ishape = in_data[fullc::kData].shape(); - const TShape& oshape = out_data[fullc::kOut].shape(); - NDArray weight = in_data[fullc::kWeight]; - NDArray data = in_data[fullc::kData]; - auto out_md = GetMemDesc(out_data[fullc::kOut]); - if (data.shape().ndim() != 2 && !param.flatten) { - data = data.ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1), +typedef MKLDNNParamOpSign MKLDNNFullyConnectedSignature; + +class MKLDNNFullyConnectedFwd { + public: + std::shared_ptr fwd_pd; + MKLDNNFullyConnectedFwd(const FullyConnectedParam ¶m, + NDArray *data, const NDArray &weights, + const NDArray *bias, const NDArray &output, + const OpReqType &req_out) { + _Init(param, data, weights, bias, output, req_out); + } + ~MKLDNNFullyConnectedFwd() {} + void SetDataHandle(const FullyConnectedParam ¶m, + NDArray *data, + const NDArray &weights, + const NDArray *bias, + const NDArray &output, + const OpReqType &req_out); + void Execute(const NDArray &output, const OpReqType &req_out); + + private: + void _Init(const FullyConnectedParam ¶m, NDArray *data, + const NDArray &weights, const NDArray *bias, + const NDArray &output, const OpReqType &req_out); + + private: + std::shared_ptr fwd; + std::shared_ptr data; + std::shared_ptr weights; + std::shared_ptr bias; + std::shared_ptr out; +}; + +void MKLDNNFullyConnectedFwd::_Init(const FullyConnectedParam ¶m, + NDArray *data, + const NDArray &weights, + const NDArray *bias, + const NDArray &output, + const OpReqType &req_out) { + const TShape& ishape = data->shape(); + const TShape& oshape = output.shape(); + auto out_md = GetMemDesc(output); + if (data->shape().ndim() != 2 && !param.flatten) { + *data = data->ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1])); - mkldnn::memory::dims out_dims{static_cast(oshape.ProdShape(0, oshape.ndim()-1)), - static_cast(oshape[ishape.ndim()-1])}; - out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), - mkldnn::memory::format::any); - } else if (data.shape().ndim() != 2) { - data = data.ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1, 
ishape.ndim()))); + mkldnn::memory::dims out_dims{static_cast(oshape.ProdShape(0, + oshape.ndim()-1)), + static_cast(oshape[ishape.ndim()-1])}; + out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(output.dtype()), + mkldnn::memory::format::any); + } else if (data->shape().ndim() != 2) { + *data = data->ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1, + ishape.ndim()))); mkldnn::memory::dims out_dims{static_cast(oshape[0]), - static_cast(oshape.ProdShape(1, oshape.ndim()))}; - out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), - mkldnn::memory::format::any); + static_cast(oshape.ProdShape(1, + oshape.ndim()))}; + out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(output.dtype()), + mkldnn::memory::format::any); } - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, - param.no_bias ? nullptr : &in_data[fullc::kBias], out_md); - auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); - auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc()); - auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], - ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]); + auto data_md = GetMemDesc(*data); + auto weight_md = GetMemDesc(weights); + auto engine = CpuEngine::Get()->get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + mkldnn::inner_product_forward::desc + ipFwd_desc(mkldnn::prop_kind::forward_training, + data_md, weight_md, bias_md, out_md); + this->fwd_pd.reset(new mkldnn::inner_product_forward::primitive_desc( + ipFwd_desc, engine)); + } else { + mkldnn::inner_product_forward::desc + ipFwd_desc(mkldnn::prop_kind::forward_training, + data_md, weight_md, out_md); + this->fwd_pd.reset(new mkldnn::inner_product_forward::primitive_desc( + ipFwd_desc, engine)); + } + + this->data.reset(new mkldnn::memory(this->fwd_pd->src_primitive_desc())); + this->weights.reset(new mkldnn::memory( + this->fwd_pd->weights_primitive_desc())); + this->out.reset(new mkldnn::memory(this->fwd_pd->dst_primitive_desc())); if (param.no_bias) { - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward( - ipFwd_pd, *data_mem, *weight_mem, *out_mem.second)); + this->fwd.reset(new mkldnn::inner_product_forward( + *(this->fwd_pd), *(this->data), *(this->weights), *(this->out))); } else { - auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc()); - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, - *data_mem, *weight_mem, *bias_mem, *out_mem.second)); + this->bias.reset(new mkldnn::memory(this->fwd_pd->bias_primitive_desc())); + this->fwd.reset(new mkldnn::inner_product_forward(*(this->fwd_pd), + *(this->data), *(this->weights), *(this->bias), *(this->out))); + } +} + +void MKLDNNFullyConnectedFwd::SetDataHandle(const FullyConnectedParam ¶m, + NDArray *data, + const NDArray &weights, + const NDArray *bias, + const NDArray &output, + const OpReqType &req_out) { + const TShape& ishape = data->shape(); + if (data->shape().ndim() != 2 && !param.flatten) { + *data = data->ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1), + ishape[ishape.ndim()-1])); + } else if (data->shape().ndim() != 2) { + *data = data->ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1, + ishape.ndim()))); } - CommitOutput(out_data[fullc::kOut], out_mem); + auto data_mem = data->GetMKLDNNDataReorder(this->fwd_pd->src_primitive_desc()); + auto weight_mem = weights.GetMKLDNNDataReorder( + this->fwd_pd->weights_primitive_desc()); + auto out = CreateMKLDNNMem(output, 
+ this->fwd_pd->dst_primitive_desc(), req_out); + + this->data->set_data_handle(data_mem->get_data_handle()); + this->weights->set_data_handle(weight_mem->get_data_handle()); + if (bias) { + auto bias_mem = bias->GetMKLDNNDataReorder( + this->fwd_pd->bias_primitive_desc()); + this->bias->set_data_handle(bias_mem->get_data_handle()); + } + this->out->set_data_handle(out.second->get_data_handle()); +} + +void MKLDNNFullyConnectedFwd::Execute(const NDArray &output, + const OpReqType &req_out) { + MKLDNNStream::Get()->RegisterPrim(*(this->fwd)); + auto out = CreateMKLDNNMem(output, + this->fwd_pd->dst_primitive_desc(), req_out); + CommitOutput(output, out); MKLDNNStream::Get()->Submit(); } -void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +static MKLDNNFullyConnectedFwd +&GetFullyConnectedFwd(const FullyConnectedParam ¶m, const OpContext &ctx, + NDArray *data, const NDArray &weights, + const NDArray *bias, const NDArray &output, + const OpReqType &req_out) { + static thread_local std::unordered_map fc_fwds; + MKLDNNFullyConnectedSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(req_out); + key.AddSign(*data); + key.AddSign(weights); + key.AddSign(output); + if (bias) { + key.AddSign(*bias); + } + + auto it = fc_fwds.find(key); + if (it == fc_fwds.end()) { + MKLDNNFullyConnectedFwd fwd(param, data, weights, bias, output, req_out); + auto ins_ret = fc_fwds.insert(std::pair(key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + +void MKLDNNFullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); + const FullyConnectedParam ¶m + = nnvm::get(attrs.parsed); + auto data = in_data[fullc::kData]; + auto weights = in_data[fullc::kWeight]; + auto output = out_data[fullc::kOut]; + OpReqType req_out = req[fullc::kOut]; + MKLDNNFullyConnectedFwd &fwd = GetFullyConnectedFwd(param, ctx, &data, + weights, param.no_bias ? nullptr : &in_data[fullc::kBias], + output, req_out); + fwd.SetDataHandle(param, &data, weights, + param.no_bias ? nullptr : &in_data[fullc::kBias], + output, req_out); + fwd.Execute(output, req_out); +} + +void MKLDNNFullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); const std::vector &in_grad = outputs; const FullyConnectedParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 9149cb0c6a94..a54fceddbc9e 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -42,11 +42,13 @@ namespace mxnet { namespace op { /* For fully connected. 
 */
-void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNFullyConnectedCompute(
+    const nnvm::NodeAttrs& attrs, const OpContext &ctx,
     const std::vector<NDArray> &in_data,
     const std::vector<OpReqType> &req,
     const std::vector<NDArray> &out_data);
-void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNFullyConnectedGradCompute(
+    const nnvm::NodeAttrs& attrs, const OpContext &ctx,
     const std::vector<NDArray> &inputs,
     const std::vector<OpReqType> &req,
     const std::vector<NDArray> &outputs);

From 083f0185554a32d16d5a846cf104ce240bbbd4f1 Mon Sep 17 00:00:00 2001
From: Jin Huang
Date: Fri, 12 Jan 2018 14:57:23 +0800
Subject: [PATCH 264/264] Fix CreateMKLDNNMem() invoke twice issue

---
 src/operator/nn/mkldnn/mkldnn_fully_connected.cc | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
index 42477fe89c2e..d396276719e8 100644
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -98,7 +98,7 @@ class MKLDNNFullyConnectedFwd {
                      const NDArray *bias,
                      const NDArray &output,
                      const OpReqType &req_out);
-  void Execute(const NDArray &output, const OpReqType &req_out);
+  void Execute(const NDArray &output);
 
  private:
   void _Init(const FullyConnectedParam &param, NDArray *data,
@@ -111,6 +111,7 @@ class MKLDNNFullyConnectedFwd {
   std::shared_ptr<mkldnn::memory> weights;
   std::shared_ptr<mkldnn::memory> bias;
   std::shared_ptr<mkldnn::memory> out;
+  OutDataOp data_op;  // remembers how Execute() must commit the output
 };
 
 void MKLDNNFullyConnectedFwd::_Init(const FullyConnectedParam &param,
@@ -200,14 +201,12 @@ void MKLDNNFullyConnectedFwd::SetDataHandle(const FullyConnectedParam &param,
     this->bias->set_data_handle(bias_mem->get_data_handle());
   }
   this->out->set_data_handle(out.second->get_data_handle());
+  this->data_op = out.first;
 }
 
-void MKLDNNFullyConnectedFwd::Execute(const NDArray &output,
-                                      const OpReqType &req_out) {
+void MKLDNNFullyConnectedFwd::Execute(const NDArray &output) {
   MKLDNNStream::Get()->RegisterPrim(*(this->fwd));
-  auto out = CreateMKLDNNMem(output,
-                             this->fwd_pd->dst_primitive_desc(), req_out);
-  CommitOutput(output, out);
+  CommitOutput(output, mkldnn_output_t(this->data_op, this->out.get()));
   MKLDNNStream::Get()->Submit();
 }
 
@@ -258,7 +257,7 @@ void MKLDNNFullyConnectedCompute(const nnvm::NodeAttrs& attrs,
   fwd.SetDataHandle(param, &data, weights,
                     param.no_bias ? nullptr : &in_data[fullc::kBias],
                     output, req_out);
-  fwd.Execute(output, req_out);
+  fwd.Execute(output);
 }
 
 void MKLDNNFullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,