This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Handle fallback correctly for write inplace when the array is MKLDNN. #10651

Merged 5 commits on May 15, 2018
12 changes: 9 additions & 3 deletions src/common/exec_utils.h
@@ -76,8 +76,8 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
 }

 inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
-                                 const std::vector<OpReqType> &req,
                                  const std::vector<NDArray> *bufs,
+                                 std::vector<OpReqType> *req,
                                  std::vector<TBlob> *blobs,
                                  std::vector<NDArray> *temp_src,
                                  std::vector<NDArray> *temp_dst) {
@@ -86,6 +86,12 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
     auto& nd = src[i];
     bool is_default = nd.storage_type() == kDefaultStorage;
 #if MXNET_USE_MKLDNN == 1
+    if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
+      // If it's write inplace and the output array doesn't use the default
+      // layout, we'll generate a temporary output array below, which means
+      // the input array and the output array are no longer the same array.
+      // we should change the request type.
+      req->at(i) = kWriteTo;
     // We have to make sure it's default storage and default layout.
     is_default = nd.IsDefaultData();
 #endif
@@ -115,9 +121,9 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
  */
 inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    const std::vector<NDArray> &ndoutputs,
-                                   const std::vector<OpReqType> &req,
                                    const std::vector<NDArray> *in_bufs,
                                    const std::vector<NDArray> *out_bufs,
+                                   std::vector<OpReqType> *req,
                                    std::vector<TBlob> *input_blobs,
                                    std::vector<TBlob> *output_blobs,
                                    std::vector<NDArray> *pre_temp_src,
@@ -130,7 +136,7 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
   SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
                       in_temp_idx_map);
   // populate output blobs
-  SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
+  SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst,
                        post_temp_src);
   // add mutable inputs to post temp list
   for (const auto idx : mutate_idx) {
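Note on the signature change above: `req` becomes a mutable pointer so that `SetupDefaultBlobsOut` can downgrade a `kWriteInplace` request when the output holds MKLDNN-layout data. The snippet below is a minimal, self-contained sketch of that idea only (the `Array` type and `SetupOutput` helper are hypothetical, not MXNet code): once the output has to go through a temporary default-layout buffer, the operator no longer writes into the same memory as its input, so the request is treated as a plain `kWriteTo`.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Hypothetical stand-in for NDArray: just a buffer plus a layout flag.
struct Array {
  std::vector<float> data;
  bool mkldnn_layout;  // stand-in for NDArray::IsMKLDNNData()
};

// Returns true if a temporary output buffer is needed. In that case the
// in-place request is downgraded, mirroring the logic added to
// SetupDefaultBlobsOut above.
bool SetupOutput(const Array &out, OpReqType *req, Array *temp) {
  if (*req == kWriteInplace && out.mkldnn_layout) {
    // The op will write into `temp`, not into `out`, so input and output
    // are no longer the same memory: treat the request as kWriteTo.
    *req = kWriteTo;
    temp->data.resize(out.data.size());
    temp->mkldnn_layout = false;
    return true;
  }
  return false;
}

int main() {
  Array out{std::vector<float>(4, 0.0f), /*mkldnn_layout=*/true};
  Array temp{std::vector<float>(), false};
  OpReqType req = kWriteInplace;
  bool used_temp = SetupOutput(out, &req, &temp);
  assert(used_temp && req == kWriteTo);
  std::printf("request downgraded to kWriteTo, temp size = %zu\n", temp.data.size());
  return 0;
}
```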
7 changes: 6 additions & 1 deletion src/executor/attach_op_execs_pass.cc
@@ -78,7 +78,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
     pre_temp_src_.clear(); pre_temp_dst_.clear();
     post_temp_src_.clear(); post_temp_dst_.clear();
     in_temp_idx_map_.clear();
-    SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
+    tmp_req = req;
+    SetupDefaultBlobsInOut(in_array, out_array, &pre_temp_buf_, &post_temp_buf_, &req,
                            &in_data_, &out_data_,
                            &pre_temp_src_, &pre_temp_dst_,
                            &post_temp_src_, &post_temp_dst_,
@@ -89,8 +90,12 @@ class StorageFallbackOpExecutor : public OpExecutor {
   // storage fallback after fcompute is completed
   void PostFCompute(bool is_gpu) {
     common::CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx, is_gpu);
+    req = tmp_req;
   }

+  // output requirement on each output array.
+  // This temporarily saves the original output requirements.
+  std::vector<OpReqType> tmp_req;
   // default storage tensor blobs for fcompute
   std::vector<TBlob> in_data_, out_data_;
   // These are NDArray buffers for cast storage.
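Note: in the graph executor, `req` is a member of `OpExecutor` that is reused every time the node runs, so the rewrite performed during fallback setup must be undone once the computation finishes; that is what the new `tmp_req` member and the `req = tmp_req;` line in `PostFCompute` do. A standalone sketch of this save/restore pattern, with hypothetical names (`FallbackExecutor` is not an MXNet class):

```cpp
#include <cassert>
#include <vector>

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Hypothetical executor that keeps its request vector across runs, like
// StorageFallbackOpExecutor keeps `req`.
struct FallbackExecutor {
  std::vector<OpReqType> req{kWriteInplace};  // persists between Run() calls
  std::vector<OpReqType> tmp_req;             // per-run backup

  void PreCompute() {
    tmp_req = req;      // save the original requirements
    req[0] = kWriteTo;  // the fallback may rewrite an in-place request
  }
  void PostCompute() {
    req = tmp_req;      // restore so the next run sees the original request
  }
  void Run() {
    PreCompute();
    // ... fcompute would run here with the rewritten `req` ...
    PostCompute();
  }
};

int main() {
  FallbackExecutor exec;
  exec.Run();
  // Without the restore in PostCompute, the next run would wrongly see kWriteTo.
  assert(exec.req[0] == kWriteInplace);
  return 0;
}
```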
10 changes: 6 additions & 4 deletions src/imperative/imperative_utils.h
@@ -373,16 +373,17 @@ inline void PushFCompute(const FCompute& fn,
 #if MXNET_USE_MKLDNN == 1
       InvalidateOutputs(outputs, req);
 #endif
+      std::vector<OpReqType> tmp_req = req;
       // setup blobs
-      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+      SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
                              &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
                              &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup context
       OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
       bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
       // pre-fcompute fallback, cast to default storage type
       CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
-      fn(attrs, opctx, input_blobs, req, output_blobs);
+      fn(attrs, opctx, input_blobs, tmp_req, output_blobs);
       // post-fcompute fallback, cast to original storage type
       CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
       if (is_gpu) {
@@ -492,15 +493,16 @@ inline void PushOperator(const OpStatePtr& state,
 #if MXNET_USE_MKLDNN == 1
       InvalidateOutputs(outputs, req);
 #endif
+      std::vector<OpReqType> tmp_req = req;
       // populate input blobs and output blobs
-      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+      SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
                              &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
                              &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup contexts
       bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
       // pre-fcompute fallback
       CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
-      fcompute(state, opctx, input_blobs, req, output_blobs);
+      fcompute(state, opctx, input_blobs, tmp_req, output_blobs);
       // post-fcompute fallback, cast to original storage type, if necessary
       CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
       if (is_gpu && exec_type == ExecType::kSync) {
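Note: unlike the executor path, the imperative path keeps no state between calls, so it simply copies the request vector into a local `tmp_req`, lets `SetupDefaultBlobsInOut` rewrite the copy, and hands the copy to `fn`/`fcompute`; the caller's `req` is never modified and nothing has to be restored. A tiny standalone sketch of that (the `FallbackSetup` helper is hypothetical, not MXNet code):

```cpp
#include <cassert>
#include <vector>

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Stand-in for SetupDefaultBlobsInOut rewriting in-place requests.
void FallbackSetup(std::vector<OpReqType> *req) {
  for (auto &r : *req)
    if (r == kWriteInplace) r = kWriteTo;
}

int main() {
  const std::vector<OpReqType> req{kWriteInplace};
  std::vector<OpReqType> tmp_req = req;  // local copy, as in PushFCompute
  FallbackSetup(&tmp_req);               // only the copy is rewritten
  // The original request vector is untouched; the kernel sees the copy.
  assert(req[0] == kWriteInplace && tmp_req[0] == kWriteTo);
  return 0;
}
```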
5 changes: 2 additions & 3 deletions src/ndarray/ndarray.cc
@@ -1118,9 +1118,8 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
                to_mem->get_primitive_desc().get_size());
       memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
     } else {
-      std::vector<mkldnn::primitive> net;
-      net.push_back(mkldnn::reorder(*from_mem, *to_mem));
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+      const_cast<NDArray &>(to).CopyFrom(*from_mem);
+      MKLDNNStream::Get()->Submit();
     }
   } else {
     // In this case, one of the NDArray isn't supported by MKLDNN, we need
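Note: the copy in `CopyFromToDnsImpl` now goes through `NDArray::CopyFrom` followed by `MKLDNNStream::Get()->Submit()` instead of building and eagerly submitting a one-off `mkldnn::stream`. The sketch below only illustrates the queue-then-submit idea behind that change; `PrimitiveQueue` is a hypothetical stand-in, not MXNet's `MKLDNNStream`, which queues real MKLDNN primitives.

```cpp
#include <functional>
#include <iostream>
#include <vector>

// Hypothetical per-thread queue of pending primitives.
class PrimitiveQueue {
 public:
  static PrimitiveQueue *Get() {
    static thread_local PrimitiveQueue queue;
    return &queue;
  }
  void Register(std::function<void()> prim) { net_.push_back(std::move(prim)); }
  void Submit() {
    // Run everything queued so far in one batch, then clear the queue.
    for (auto &prim : net_) prim();
    net_.clear();
  }

 private:
  std::vector<std::function<void()>> net_;
};

int main() {
  // Queue two "reorders" and execute them together, mirroring
  // CopyFrom(...) followed by MKLDNNStream::Get()->Submit().
  PrimitiveQueue::Get()->Register([] { std::cout << "reorder A\n"; });
  PrimitiveQueue::Get()->Register([] { std::cout << "reorder B\n"; });
  PrimitiveQueue::Get()->Submit();
  return 0;
}
```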
8 changes: 7 additions & 1 deletion src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -35,7 +35,13 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                 const NDArray &in_data, const OpReqType &req,
                 const NDArray &out_data) {
   TmpMemMgr::Get()->Init(ctx.requested[0]);
-  auto in_mem = in_data.GetMKLDNNData();
+
+  // If the input data is a view of an MKLDNN array, we should create a new
+  // NDArray with reordered data.
+  NDArray data = in_data;
+  if (data.IsMKLDNNData() && data.IsView())
+    data = data.Reorder2Default();
+  auto in_mem = data.GetMKLDNNData();
   if (req == kAddTo) {
     TmpMemMgr::Get()->Init(ctx.requested[0]);
     // We should try and force the output memory has the same format
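Note: `MKLDNNCopy` now normalizes its input first: if `in_data` is a view (for example a slice) of an array holding MKLDNN-blocked memory, the view cannot be handed to the copy primitive directly, so the data is reordered back to the default layout before `GetMKLDNNData()` is called. A standalone sketch of that guard with hypothetical types (this is not `NDArray`):

```cpp
#include <cassert>
#include <vector>

// Hypothetical array that may carry a non-default (blocked) layout and may
// be a view into a larger buffer.
struct Array {
  std::vector<float> data;
  bool blocked_layout;
  bool is_view;
};

// Stand-in for Reorder2Default(): drops both flags; a real implementation
// would also physically rearrange the elements into the default layout.
Array Reorder2Default(const Array &src) {
  Array out = src;
  out.blocked_layout = false;
  out.is_view = false;
  return out;
}

// Mirrors the guard added in MKLDNNCopy: only views of blocked-layout arrays
// need the extra reorder before their memory is used.
Array NormalizeInput(const Array &in) {
  if (in.blocked_layout && in.is_view)
    return Reorder2Default(in);
  return in;
}

int main() {
  Array view{{1.0f, 2.0f, 3.0f}, /*blocked_layout=*/true, /*is_view=*/true};
  Array safe = NormalizeInput(view);
  assert(!safe.blocked_layout && !safe.is_view);
  return 0;
}
```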
192 changes: 187 additions & 5 deletions tests/cpp/operator/mkldnn.cc
@@ -26,6 +26,7 @@
 #if MXNET_USE_MKLDNN == 1

 #include "gtest/gtest.h"
+#include "mxnet/imperative.h"
 #include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"

 using namespace mxnet;
@@ -97,12 +98,18 @@ static void InitArray(NDArray *arr) {
 }

 // Init arrays with the specified layout.
-static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd) {
+static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd,
+                            bool is_rand = false) {
   const TBlob &blob = arr->data();
   mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
   size_t size = blob.Size();
-  for (size_t i = 0; i < size; i++)
-    data[i] = i;
+  if (is_rand) {
+    for (size_t i = 0; i < size; i++)
+      data[i] = std::rand();
+  } else {
+    for (size_t i = 0; i < size; i++)
+      data[i] = i;
+  }
   arr->MKLDNNDataReorderAsync(pd);
   arr->WaitToRead();
 }
@@ -206,7 +213,7 @@ static std::vector<mkldnn::memory::format> GetMKLDNNFormat(size_t num_dims, int
 }

 struct TestArrayShapes {
-  std::vector<TShape> shapes;
+  std::vector<nnvm::TShape> shapes;
   std::vector<mkldnn::memory::primitive_desc> pds;
 };

@@ -239,7 +246,7 @@ static TestArrayShapes GetTestArrayShapes() {
   {
     // 4D
     TShape s1(4);
-    s1[0] = 1; s1[1] = 96; s1[2] = 54; s1[3] = 54;
+    s1[0] = 10; s1[1] = 96; s1[2] = 54; s1[3] = 54;
     shapes.push_back(s1);
     pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw));

@@ -332,4 +339,179 @@ TEST(MKLDNN_NDArray, GetDataReorder) {
(Apart from the four context lines at its boundaries, the remainder of this hunk is newly added test code.)
}
}

struct OpAttrs {
nnvm::NodeAttrs attrs;
std::vector<DispatchMode> dispatches;
};

OpAttrs GetCopyOp() {
OpAttrs attrs;
attrs.attrs.op = Op::Get("_copy");
attrs.dispatches.resize(2);
attrs.dispatches[0] = DispatchMode::kFCompute;
attrs.dispatches[1] = DispatchMode::kFComputeEx;
return attrs;
}

OpAttrs GetLeakyReluOp() {
OpAttrs attrs;
attrs.attrs.op = Op::Get("LeakyReLU");
attrs.dispatches.resize(1);
attrs.dispatches[0] = DispatchMode::kFCompute;
return attrs;
}

/*
* We want to get a few types of NDArrays for testing:
* 1. Normal NDArray
* 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
* 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
* dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this
* type of NDArrays only exists for weight arrays. I don't think we should
* pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
* 4. Reshaped/sliced NDArray
* 5. Reshaped/sliced NDArray with MKLDNN layout (reshape/slice from Normal NDArray
* with MKLDNN layout)
* 6. Reshaped/sliced NDArray with MKLDNN layout whose MKLDNN memory may have
* different dimensions from the NDArray (result of MKLDNNDataReorderAsync).
* However, this type of NDArrays only exists for weight arrays. I don't think
* we should pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
*
*/
std::vector<NDArray> GetTestInputArrays() {
TestArrayShapes tas = GetTestArrayShapes();
std::vector<nnvm::TShape> shapes = tas.shapes;
std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;

std::vector<NDArray> in_arrs;
for (auto shape : shapes) {
in_arrs.emplace_back(shape, Context());
InitArray(&in_arrs.back());
for (auto pd : pds) {
if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
continue;

in_arrs.emplace_back(shape, Context());
InitMKLDNNArray(&in_arrs.back(), pd);

// Get a sliced version.
NDArray arr(shape, Context());
InitMKLDNNArray(&arr, pd);
arr = arr.Slice(1, arr.shape()[0] - 1);
in_arrs.emplace_back(arr);
}
}
return in_arrs;
}

/*
* We want to get a few types of NDArrays for testing:
* 1. Normal NDArray
* 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
* 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
* dimensions from the NDArray (result of MKLDNNDataReorderAsync). However, this
* type of NDArrays only exists for weight arrays. I don't think we should
* pass them to all operators.
* In the inference mode, the MKLDNN memory in the weight array will be
* reordered to 5 dimensions.
* 4. Reused NDArray (this is created by the MXNet executor). This type of
* NDArrays can only be used as output arrays.
*/
std::vector<NDArray> GetTestOutputArrays(const TShape &shape,
const std::vector<mkldnn::memory::primitive_desc> &pds) {
std::vector<NDArray> in_arrs;
in_arrs.emplace_back(shape, Context());
InitArray(&in_arrs.back());

// Get a reused version.
nnvm::TShape s(1);
s[0] = shape.Size();
NDArray arr(s, Context());
arr = arr.AsArray(shape, arr.dtype());
InitArray(&arr);
in_arrs.emplace_back(arr);

for (auto pd : pds) {
if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
continue;

in_arrs.emplace_back(shape, Context());
InitMKLDNNArray(&in_arrs.back(), pd, true);

// Get a reused version.
nnvm::TShape s(1);
s[0] = shape.Size();
arr = NDArray(s, Context());
arr = arr.AsArray(shape, arr.dtype());
InitMKLDNNArray(&arr, pd, true);
in_arrs.emplace_back(arr);
}
return in_arrs;
}

using VerifyFunc = std::function<void (const NDArray &in_arr, const NDArray &arr)>;

void VerifyCopyResult(const NDArray &in_arr, const NDArray &arr) {
NDArray tmp1 = in_arr.Reorder2Default();
NDArray tmp2 = arr.Reorder2Default();
EXPECT_EQ(tmp1.shape().Size(), tmp2.shape().Size());
TBlob d1 = tmp1.data();
TBlob d2 = tmp2.data();
EXPECT_EQ(memcmp(d1.dptr_, d2.dptr_,
tmp1.shape().Size() * sizeof(mshadow::default_real_t)), 0);
}

void TestUnaryOp(const OpAttrs &attrs, VerifyFunc verify_fn) {
std::vector<NDArray*> inputs(1);
std::vector<NDArray*> outputs(1);
std::vector<OpReqType> req(1);
std::vector<DispatchMode> dispatches = attrs.dispatches;

TestArrayShapes tas = GetTestArrayShapes();
std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;

std::vector<NDArray> in_arrs = GetTestInputArrays();
for (auto in_arr : in_arrs) {
for (auto dispatch : dispatches) {
std::vector<NDArray> out_arrs = GetTestOutputArrays(in_arr.shape(), pds);
for (auto out_arr : out_arrs) {
req[0] = kWriteTo;
inputs[0] = &in_arr;
outputs[0] = &out_arr;
Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs,
outputs, req, dispatch, mxnet::OpStatePtr());
out_arr.WaitToRead();
verify_fn(in_arr, out_arr);
}
}
}

for (auto dispatch : dispatches) {
in_arrs = GetTestInputArrays();
for (auto arr : in_arrs) {
// If the array is a view, we shouldn't write data to it.
if (arr.IsView())
continue;

NDArray orig = arr.Copy(arr.ctx());
req[0] = kWriteInplace;
inputs[0] = &arr;
outputs[0] = &arr;
Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs, outputs, req,
dispatch, mxnet::OpStatePtr());
arr.WaitToRead();
verify_fn(orig, arr);
}
}
}

TEST(IMPERATIVE, UnaryOp) {
OpAttrs attrs = GetCopyOp();
TestUnaryOp(attrs, VerifyCopyResult);
}

#endif