Add MKLDNN convolution.
zheng-da committed Dec 17, 2017
1 parent 79c563c commit 2f5ed28
Showing 3 changed files with 397 additions and 49 deletions.
185 changes: 136 additions & 49 deletions src/operator/nn/convolution.cc
@@ -26,11 +26,7 @@

#include "./convolution-inl.h"
#include "../elemwise_op_common.h"
#if MXNET_USE_MKL2017 == 1
#include <mkl_memory.h>
#include "../mkl/mkl_memory-inl.h"
#include "../mkl/mkl_convolution-inl.h"
#endif // MXNET_USE_MKL2017
#include "./mkldnn/mkldnn_ops-inl.h"
#if MXNET_USE_NNPACK == 1
#include "./nnpack/nnpack_convolution-inl.h"
#endif // MXNET_USE_NNPACK
@@ -51,6 +47,46 @@ static inline std::vector<std::string> ListArguments(const ConvolutionParam& par
}
}

static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
#if MXNET_USE_MKLDNN == 1
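// Fast path: use the MKLDNN convolution for float32 inputs; other dtypes
// fall through to the default dense (TBlob) implementation below.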
switch (inputs[0].dtype()) {
case mshadow::kFloat32:
MKLDNNConvolution_Forward(attrs, ctx, inputs, req, outputs);
return;
}
#endif
// TODO: I need to convert the format.
std::vector<TBlob> in_blobs(inputs.size());
for (size_t i = 0; i < in_blobs.size(); i++)
in_blobs[i] = inputs[i].data();
std::vector<TBlob> out_blobs(outputs.size());
for (size_t i = 0; i < out_blobs.size(); i++)
out_blobs[i] = outputs[i].data();
ConvolutionCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);

TaoLv commented on Jan 7, 2018

Hi Da, I am a little confused here. When the inputs or parameters are not supported by MKLDNN, the computation falls back to the original implementation here. But the output of this convolution has the MKLDNNStorage type, while the original implementation computes on the TBlob buffer. I am afraid the following layer will get an incorrect input.

zheng-da (Author, Owner) commented on Jan 7, 2018
Not really. MKLDNNStorage supports multiple layouts. If the data is in the default layout, it generates a TBlob just fine. If it is not, the NDArray converts itself to the default layout, so it still works.

MKLDNNStorage no longer exists. There is no special storage type for MKLDNN; it also uses DefaultStorage, so we can now convert the layout in an NDArray freely.

TaoLv commented on Jan 8, 2018
So there are still two buffers in the NDArray structure, one for dense memory and the other for MKLDNN memory, right? If so, how do layers/ops decide which buffer to use? In my understanding of the current implementation, if the NDArray has the DefaultStorage type, ops compute on the dense memory (TBlob), and if the NDArray has the MKLDNNStorage type, ops compute on the MKLDNN memory (Mkl_mem_). How is this handled after the MKLDNNStorage type is removed?

TaoLv commented on Jan 8, 2018
I see. MXNet reorders the data in SetTBlob when outputs[i].data() is called, and returns the buffer in Mkl_mem_. Thanks.
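Below is a minimal sketch of the fallback conversion discussed in this thread. It is illustrative only: the helpers IsMKLDNNLayout() and ReorderToDefault() are hypothetical stand-ins, since the actual reorder happens inside NDArray (SetTBlob) when data() is called.

// Sketch of what the fallback path relies on when an input NDArray
// carries an MKLDNN-specific layout (hypothetical helper names).
static TBlob GetDefaultBlob(const NDArray &arr) {
  if (IsMKLDNNLayout(arr)) {
    // Reorder MKLDNN memory (e.g. nChw16c) into a plain NCHW buffer
    // before handing a TBlob to the legacy CPU kernel.
    NDArray tmp = ReorderToDefault(arr);
    return tmp.data();
  }
  // Already in the default dense layout; the TBlob aliases the buffer.
  return arr.data();
}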

}

static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
#if MXNET_USE_MKLDNN == 1
switch (inputs[0].dtype()) {
case mshadow::kFloat32:
MKLDNNConvolution_Backward(attrs, ctx, inputs, req, outputs);
return;
}
#endif
// TODO: I need to convert the format.
std::vector<TBlob> in_blobs(inputs.size());
for (size_t i = 0; i < in_blobs.size(); i++)
in_blobs[i] = inputs[i].data();
std::vector<TBlob> out_blobs(outputs.size());
for (size_t i = 0; i < out_blobs.size(); i++)
out_blobs[i] = outputs[i].data();
ConvolutionGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
}

static bool ConvolutionShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_shape,
std::vector<TShape> *out_shape) {
@@ -67,50 +103,50 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs,
if (dshp.ndim() == 0) return false;

if (param_.kernel.ndim() == 1) {
// 1d conv
CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
param_.kernel[0]);
wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
wshape[0] *= param_.num_group;
SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
if (!param_.no_bias) {
SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
}

const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
CHECK_EQ(dshape[1] % param_.num_group, 0U) \
<< "input num_filter must divide group size";
CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
<< "output num_filter must divide group size";
CHECK_GT(param_.kernel.Size(), 0U) \
<< "incorrect kernel size: " << param_.kernel;
CHECK_GT(param_.stride.Size(), 0U) \
<< "incorrect stride size: " << param_.stride;
CHECK_GT(param_.dilate.Size(), 0U) \
<< "incorrect dilate size: " << param_.dilate;
Shape<3> oshape;
oshape[0] = dshape[0];
oshape[1] = param_.num_filter;
oshape[2] = dshape[2] ?
(AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
// Perform incomplete shape inference. Fill in the missing values in data shape.
// 1) We can always fill in the batch_size.
// 2) We can back-calculate the input height/width if the corresponding stride is 1.
oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
dshape[0] = oshape[0];
if (oshape[2] && param_.stride[0] == 1) {
dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
}
SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
ConvertLayout(dshape, kNCW, param_.layout.value()));
// Check whether the kernel sizes are valid
if (dshape[2] != 0) {
CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
}
return true;
} else if (param_.kernel.ndim() == 2) {
// 2d conv
CHECK_EQ(dshp.ndim(), 4U) \
@@ -259,6 +295,53 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
return true;
}

inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
uint32_t in_expected = param.no_bias ? 2 : 3;
CHECK_EQ(in_attrs->size(), in_expected);
CHECK_EQ(out_attrs->size(), 1);

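// On CPU builds with MKLDNN support, dispatch to FComputeEx so the output
// can use MKLDNN storage; otherwise fall back to the dense FCompute path.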
#if MXNET_USE_MKLDNN == 1
if (dev_mask == mshadow::cpu::kDevMask) {
*dispatch_mode = DispatchMode::kFComputeEx;
(*out_attrs)[0] = kMKLDNNStorage;
return true;
}
#endif
*dispatch_mode = DispatchMode::kFCompute;
(*out_attrs)[0] = kDefaultStorage;
return true;
}

inline static bool backward_ConvStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
uint32_t in_expected = param.no_bias ? 3 : 4;
uint32_t out_expected = param.no_bias ? 2 : 3;
CHECK_EQ(in_attrs->size(), in_expected);
CHECK_EQ(out_attrs->size(), out_expected);

#if MXNET_USE_MKLDNN == 1
if (dev_mask == mshadow::cpu::kDevMask) {
*dispatch_mode = DispatchMode::kFComputeEx;
for (size_t i = 0; i < out_attrs->size(); i++)
(*out_attrs)[i] = kMKLDNNStorage;
return true;
}
#endif
*dispatch_mode = DispatchMode::kFCompute;
for (size_t i = 0; i < out_attrs->size(); i++)
(*out_attrs)[i] = kDefaultStorage;
return true;
}

static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
using namespace mshadow;
ConvolutionParam param_;
@@ -400,7 +483,9 @@ There are other options to tune the performance.
})
.set_attr<nnvm::FInferShape>("FInferShape", ConvolutionShape)
.set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
.set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
.set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionCompute_CPU)
.set_attr<nnvm::FGradient>("FGradient", ConvolutionGrad{"_backward_Convolution"})
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
@@ -416,11 +501,13 @@ NNVM_REGISTER_OP(_backward_Convolution)
return params.no_bias ? 2 : 3;
})
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr<FInferStorageType>("FInferStorageType", backward_ConvStorageType)
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr_parser(ConvolutionParamParser)
.set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>);
.set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>)
.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradCompute_CPU);

} // namespace op
} // namespace mxnet
