OneDNN md-in-tensor refactoring part 5: Memory descriptor enabled for elementwises, reductions and expand_v2 ops #43036

Merged: 5 commits merged on May 30, 2022.
Changes from 4 commits.
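Note for readers: the common thread across all files below is replacing the legacy layout/format pair stored on a Tensor with the full oneDNN memory descriptor. A minimal sketch of the before/after pattern, using the same names as the first hunk below (z is the output tensor, dst_memory the oneDNN destination memory):

// Before: layout and memory format tracked separately on the tensor
z->set_layout(DataLayout::kMKLDNN);
z->set_format(platform::GetMKLDNNFormat(*dst_memory));

// After: the complete oneDNN memory descriptor is stored on the tensor
z->set_mem_desc(dst_memory->get_desc());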
19 changes: 8 additions & 11 deletions paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -145,8 +145,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
binary_prim->execute(astream, args);
astream.wait();

z->set_layout(DataLayout::kMKLDNN);
z->set_format(platform::GetMKLDNNFormat(*dst_memory));
z->set_mem_desc(dst_memory->get_desc());
}
};

@@ -179,7 +178,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
onednn_engine);

auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
dout->format(), platform::to_void_cast(dout->data<T>()));
dout->mem_desc(), platform::to_void_cast(dout->data<T>()));

auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();

@@ -189,7 +188,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
// elementwise_add & elementwise_sub
if (BINARY_OP == dnnl::algorithm::binary_add ||
BINARY_OP == dnnl::algorithm::binary_sub) {
dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(),
dst_memory = reorder_handler.AcquireDstMemory(dx, dout->mem_desc(),
ctx.GetPlace());
auto reorder_p =
reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p);
@@ -218,8 +217,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
}
astream.wait();

dx->set_layout(framework::DataLayout::kMKLDNN);
dx->set_format(platform::GetMKLDNNFormat(*dst_memory));
dx->set_mem_desc(dst_memory->get_desc());
}

if (dy) {
@@ -232,7 +230,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
BINARY_OP == dnnl::algorithm::binary_sub) {
if (dout->dims() == dy->dims()) {
auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
dy, dout->format(), ctx.GetPlace());
dy, dout->mem_desc(), ctx.GetPlace());

dnnl::primitive_attr reorder_attr;
std::vector<float> scales(1);
@@ -301,7 +299,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
dst_memory = dst_dy_memory;
}
astream.wait();
dy->set_layout(DataLayout::kMKLDNN);

if (dout->dims() != dy->dims()) {
// Broadcasting
@@ -324,10 +321,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
{DNNL_ARG_DST, *dst_memory},
});
astream.wait();
dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims()))));
dy->set_mem_desc(dst_memory->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims())));
} else {
dy->set_format(platform::GetMKLDNNFormat(*dst_memory));
dy->set_mem_desc(dst_memory->get_desc());
}
}
}
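When dout->dims() != dy->dims() above, dy is obtained by reducing dout, and the oneDNN reduction keeps dout's rank (reduced axes become 1); the descriptor is therefore reshaped to dy's real dims before being stored. A rough standalone illustration with made-up shapes, assuming the oneDNN 2.x C++ API:

#include <dnnl.hpp>

int main() {
  // Hypothetical case: dout is 2x3x4 and dy broadcasts along the first two
  // axes, so the reduction yields a 1x1x4 result that must be reshaped to {4}.
  dnnl::memory::desc reduced_md({1, 1, 4}, dnnl::memory::data_type::f32,
                                dnnl::memory::format_tag::abc);
  dnnl::memory::desc dy_md = reduced_md.reshape({4});  // matches dy->dims()
  return 0;
}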
21 changes: 9 additions & 12 deletions paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
@@ -45,19 +45,17 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel<T> {
out_new_dims[i] = out_new_dims[i] > 0 ? out_new_dims[i] : x_vec_dims[i];
}

dnnl::memory::desc x_mem_desc = x->mem_desc();
if (x_vec_dims.size() != out_new_dims.size()) {
x_mem_desc = GetExtendedMemoryDescriptor(x_mem_desc, x_vec_dims,
out_new_dims.size());
x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size());
}

out->Resize(phi::make_ddim(out_new_dims));
paddle::platform::BroadcastDataMKLDNNHandler<T> handler(
dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), out, x,
0.0f, 1.0f, x_mem_desc);
dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), x, out,
0.0f, 1.0f, x_vec_dims);

auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p = handler.AcquireDstMemory(out); // acquires zeroed mem
auto dst_memory_p = handler.AcquireZeroedDstMemory(out);
auto binary_p = handler.AcquireForwardPrimitive();

const std::unordered_map<int, dnnl::memory> args = {
@@ -73,14 +71,13 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel<T> {
}

private:
dnnl::memory::desc GetExtendedMemoryDescriptor(
const dnnl::memory::desc& x_mem_desc,
const std::vector<int64_t>& x_vec_dims, int new_size) const {
std::vector<int64_t> new_dims(new_size, 1);
std::vector<int64_t> GetExtendedXDims(const std::vector<int64_t>& x_vec_dims,
int new_size) const {
std::vector<int64_t> extended_x_dims(new_size, 1);
std::copy(x_vec_dims.begin(), x_vec_dims.end(),
new_dims.begin() + new_size - x_vec_dims.size());
extended_x_dims.begin() + new_size - x_vec_dims.size());

return x_mem_desc.reshape(new_dims);
return extended_x_dims;
}
};

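GetExtendedXDims above only left-pads the input shape with 1s until it matches the output rank, so the subsequent binary broadcast lines the dimensions up from the right. A self-contained sketch of the same logic with a concrete example (standalone copy, for illustration only):

#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> GetExtendedXDims(const std::vector<int64_t>& x_vec_dims,
                                      int new_size) {
  std::vector<int64_t> extended_x_dims(new_size, 1);
  std::copy(x_vec_dims.begin(), x_vec_dims.end(),
            extended_x_dims.begin() + new_size - x_vec_dims.size());
  return extended_x_dims;
}

// Example: expanding an x of shape {3, 4} to a rank-4 output turns the
// source dims into {1, 1, 3, 4} before x->mem_desc().reshape() is applied.
// auto dims = GetExtendedXDims({3, 4}, 4);  // -> {1, 1, 3, 4}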
69 changes: 32 additions & 37 deletions paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
@@ -29,11 +29,11 @@ inline std::vector<int64_t> CalculateReducedDims(
bool reduce_all, bool keep_dim) {
if (keep_dim) return phi::vectorize(output->dims());

if (reduce_all)
return std::vector<int64_t>(phi::vectorize(input->dims()).size(), 1);
if (reduce_all) return std::vector<int64_t>(input->dims().size(), 1);

std::vector<int64_t> output_dims(phi::vectorize(input->dims()));
std::vector<int64_t> output_dims = phi::vectorize(input->dims());
Contributor [QUESTION]:
Was this change necessary?

Contributor Author:
No, it will produce the exact same assembly; I will revert it then.

for (size_t i = 0; i < reduce_dims.size(); ++i) {
// handle negative dims, f.e. -1 means last dimension
Contributor [CONCERN]:
I think "last dimension" is not the best description. In (1, 3, -1, 512, 512), is the last dimension -1 or 512?

Contributor Author:
You're right, I'll change it to "rightmost".

reduce_dims[i] = (reduce_dims[i] >= 0)
? reduce_dims[i]
: input->dims().size() + reduce_dims[i];
@@ -52,51 +52,52 @@ class ReduceMKLDNNKernel : public framework::OpKernel<T> {
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& onednn_engine = dev_ctx.GetEngine();

const auto* input = ctx.Input<LoDTensor>("X");
auto* output = ctx.Output<Tensor>("Out");
const auto* x = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<Tensor>("Out");

auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
bool reduce_all = ctx.Attr<bool>("reduce_all");
bool keep_dim = ctx.Attr<bool>("keep_dim");

auto output_dims =
CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim);
auto input_dims = phi::vectorize(input->dims());
auto x_tz = phi::vectorize(x->dims());
auto out_tz =
CalculateReducedDims(x, out, reduce_dims, reduce_all, keep_dim);
Comment on lines +55 to +64

Contributor [OPINION]:
I see why you changed the names here, but `input` > `x` IMO.

Contributor Author:
I have changed them to unify all of our ops, since all new ops follow this pattern, i.e. inputs are named 'X' and 'Y' and the output is named 'Out'. Some legacy operators have different names because of backwards compatibility, but I was thinking that unifying them to get rid of the additional noise would be nice.


auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();

// oneDNN reduce op does not support edge case in which memory is being
// copied without actual reduction.
// In that case reorder must be executed to maintain compatibility with
// PaddlePaddle reduce op
if (input_dims == output_dims) {
dnnl::memory::data_type input_type = framework::ToMKLDNNDataType(
framework::TransToProtoVarType(input->dtype()));
if (x_tz == out_tz) {
dnnl::memory::data_type x_type = framework::ToMKLDNNDataType(
framework::TransToProtoVarType(x->dtype()));
platform::ReorderMKLDNNHandler reorder_handler(
input_dims, framework::TransToProtoVarType(input->dtype()),
input_type, onednn_engine);
x_tz, framework::TransToProtoVarType(x->dtype()), x_type,
onednn_engine);

auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
input->mem_desc(), platform::to_void_cast(input->data<T>()));
x->mem_desc(), platform::to_void_cast(x->data<T>()));

auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
output, input->mem_desc(), ctx.GetPlace());
// reuse same mem desc since it is a simple copy
Contributor [Suggested change]: replace "// reuse same mem desc since it is a simple copy" with "// reuse mem desc since it is a simple copy".

auto reorder_dst_memory_p =
reorder_handler.AcquireDstMemory(out, x->mem_desc(), ctx.GetPlace());

auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p,
reorder_dst_memory_p);

reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
astream.wait();

output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(output->dims())));
out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(out->dims())));
} else {
platform::ReductionMKLDNNHandler<T> handler(reduction_type, 0.0f, 0.0f,
onednn_engine, ctx.GetPlace(),
input, output, output_dims);
x, out, out_tz);

auto src_memory_p = handler.AcquireSrcMemory(input);
auto dst_memory_p = handler.AcquireDstMemory(output);
auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p = handler.AcquireDstMemory(out);

std::unordered_map<int, dnnl::memory> reduction_args = {
{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}};
@@ -105,8 +106,9 @@ class ReduceMKLDNNKernel : public framework::OpKernel<T> {

reduction_p->execute(astream, reduction_args);
astream.wait();
output->set_mem_desc(dst_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(output->dims())));

out->set_mem_desc(dst_memory_p->get_desc().reshape(
phi::vectorize<int64_t>(out->dims())));
}
}
};
@@ -124,25 +126,18 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel<T> {
bool keep_dim = ctx.Attr<bool>("keep_dim");
bool reduce_all = ctx.Attr<bool>("reduce_all");
auto dims = ctx.Attr<std::vector<int>>("dim");
const auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
Contributor [CONCERN]:
Do you plan to change this pointer?

Contributor Author:
Nope, nice catch, that was a mistake, thanks.

auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));

const auto input_dims =
CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim);
const auto output_dims = phi::vectorize(dx->dims());

auto dout_mem_desc = dout->mem_desc();

if (input_dims != output_dims) {
dout_mem_desc = dout_mem_desc.reshape(input_dims);
}
auto dout_tz = CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim);
auto dx_tz = phi::vectorize(dx->dims());
Comment on lines +132 to +133

Contributor [CONCERN]:
The logic changed here: input has been swapped with output. Was that intentional?
Also, what do "tz" and "d" mean in this context?

Contributor Author:
Input and output were incorrectly swapped here earlier. The reduce kernel was one of the first ops I made and it had some inconsistencies and bugs, so the change is intentional.
"d" means it is a grad tensor, so dout is the input and dx is the output. I have no idea what "tz" means, but it is commonly used as a synonym for dims inside PaddlePaddle, oneDNN and other frameworks.

platform::BroadcastDataMKLDNNHandler<T> handler(
binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y,
dout_mem_desc);
platform::BroadcastDataMKLDNNHandler<T> handler(binary_type, onednn_engine,
ctx.GetPlace(), dout, dx,
scale_x, scale_y, dout_tz);

const auto src_memory_p = handler.AcquireSrcMemory(dout);
const auto dst_memory_p = handler.AcquireDstMemory(dx);
const auto dst_memory_p = handler.AcquireZeroedDstMemory(dx);
const auto binary_prim = handler.AcquireForwardPrimitive();

const std::unordered_map<int, dnnl::memory> args = {
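For reference, CalculateReducedDims in the hunks above produces the shape oneDNN should see on the reduced side: keep_dim returns the output dims directly, reduce_all collapses everything to 1s, and otherwise the reduced (possibly negative) axes of the input shape are set to 1. A minimal sketch of that last branch, using plain vectors instead of tensors and assumed to mirror the elided tail of the helper:

#include <cstdint>
#include <vector>

std::vector<int64_t> ReducedDims(std::vector<int64_t> in_dims,
                                 const std::vector<int>& reduce_dims) {
  for (int d : reduce_dims) {
    // negative dims count from the rightmost axis, e.g. -1 is the last axis
    const int axis = d >= 0 ? d : static_cast<int>(in_dims.size()) + d;
    in_dims[axis] = 1;  // reduced axes are kept as size-1 entries for oneDNN
  }
  return in_dims;
}

// ReducedDims({2, 3, 4}, {-1}) -> {2, 3, 1}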
76 changes: 27 additions & 49 deletions paddle/fluid/platform/mkldnn_reuse.h
@@ -616,29 +616,17 @@ class BinaryMKLDNNHandler
public:
BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis,
const dnnl::engine engine, platform::Place cpu_place,
const Tensor* x, const Tensor* y, Tensor* z,
float scale_x, float scale_y, float scale_z,
const Tensor* x, const Tensor* y, Tensor* out,
float scale_x, float scale_y, float scale_out,
const dnnl::post_ops& post_ops = dnnl::post_ops{})
: platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
PADDLE_ENFORCE_EQ(
x->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
"Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d",
DataLayout::kMKLDNN, x->layout()));

PADDLE_ENFORCE_EQ(
y->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
"Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d",
DataLayout::kMKLDNN, y->layout()));

const auto src_x_tz = phi::vectorize(x->dims());
const auto src_y_tz = phi::vectorize(y->dims());
// if output tensor(z) is nullptr then we are computing into oneDNN
// managed buffer
auto rankdiff = x->dims().size() - y->dims().size();
const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
: phi::vectorize(z->dims());
const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
: phi::vectorize(out->dims());

auto src0_md = x->mem_desc();
auto src1_md = y->mem_desc();
@@ -667,7 +655,7 @@ class BinaryMKLDNNHandler
MKLDNNMemoryFormat::any);

auto attributes =
CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops);
CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);

this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md,
dst_md);
@@ -681,7 +669,7 @@ class BinaryMKLDNNHandler

private:
static inline dnnl::primitive_attr CreateAttributes(
dnnl::algorithm op, float scale_x, float scale_y, float scale_z,
dnnl::algorithm op, float scale_x, float scale_y, float scale_out,
dnnl::post_ops post_ops = dnnl::post_ops{}) {
// Scales set in attributes for inputs contribute to the output equation
// in the following way (assuming no broadcasting takes place):
@@ -699,9 +687,9 @@ class BinaryMKLDNNHandler
// For mul operation on the other hand
// output = (scale_out / scale_x) * x * (1.0 / scale_y) * y
// <scale_0> <scale_1>
float scale_0 = scale_z / scale_x;
float scale_0 = scale_out / scale_x;
float scale_1 =
op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y;
op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y;
dnnl::primitive_attr attributes;
attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0,
{scale_0});
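To make the scale arithmetic in the comment above concrete, here is a small worked example with illustrative quantization scales (values chosen arbitrarily):

// Assumed values, for illustration only.
const float scale_x = 0.5f, scale_y = 0.25f, scale_out = 2.0f;

// binary_add: output = (scale_out / scale_x) * x + (scale_out / scale_y) * y
const float add_scale_0 = scale_out / scale_x;  // 4.0f
const float add_scale_1 = scale_out / scale_y;  // 8.0f

// binary_mul: output = (scale_out / scale_x) * x * (1.0f / scale_y) * y
const float mul_scale_0 = scale_out / scale_x;  // 4.0f
const float mul_scale_1 = 1.0f / scale_y;       // 4.0f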
@@ -718,34 +706,28 @@ class BroadcastDataMKLDNNHandler
public:
BroadcastDataMKLDNNHandler(const dnnl::algorithm algo,
const dnnl::engine engine,
platform::Place cpu_place, const Tensor* out,
const Tensor* x, float scale_x, float scale_y,
const dnnl::memory::desc& x_mem_desc)
platform::Place cpu_place, const Tensor* x,
Tensor* out, float scale_x, float scale_y,
const std::vector<int64_t>& extended_x_dims)
Comment on lines +709 to +711

Contributor [CONCERN]:
The types are almost the same but the order has changed. I hope that our tests will detect potential problems with it. Was this change of order necessary?

Contributor Author:
It was not necessary, but to me the order (input, output) is more intuitive than (output, input). I have checked all the tests for reduce_grad and expand_v2 and they all work correctly after my changes.

: platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
PADDLE_ENFORCE_EQ(
x->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument("Wrong layout set for X tensor."));

const auto src0_tz = phi::vectorize(out->dims());

const auto src0_md =
dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType<T>(),
platform::GetPlainMKLDNNFormat(src0_tz.size()));
const auto src1_md = x->mem_desc().reshape(extended_x_dims);

const auto src1_md = x_mem_desc;

dnnl::primitive_attr attributes;
attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x});
attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y});
dnnl::primitive_attr attrs;
attrs.set_scales(DNNL_ARG_SRC_0, 0, {scale_x});
attrs.set_scales(DNNL_ARG_SRC_1, 0, {scale_y});
Contributor [OPINION]:
The old name was good.


this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md,
this->AcquireForwardPrimitiveDescriptor(attrs, algo, src0_md, src1_md,
src0_md);
}

template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(framework::Tensor* output) {
T_out* ptr = output->mutable_data<T_out>(
this->place_, this->fwd_pd_->dst_desc().get_size());
std::shared_ptr<dnnl::memory> AcquireZeroedDstMemory(framework::Tensor* out) {
T_out* ptr = out->mutable_data<T_out>(this->place_,
this->fwd_pd_->dst_desc().get_size());
memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr);
}
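The rename to AcquireZeroedDstMemory makes the intent explicit: in the expand_v2 kernel above the broadcast is realized as a binary_add of the reshaped source onto the output, so the destination buffer is zero-filled first and the add degenerates into a broadcasted copy. A trivial sketch of that preparation step (buffer size is hypothetical; the memset mirrors the one in the handler):

#include <cstring>
#include <vector>

int main() {
  const std::size_t out_numel = 8;           // hypothetical output size
  std::vector<float> out_buffer(out_numel);  // destination buffer
  // out = 0, so the subsequent binary_add writes broadcast(x) on top of zeros.
  std::memset(out_buffer.data(), 0, out_buffer.size() * sizeof(float));
  return 0;
}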
@@ -758,22 +740,18 @@ class ReductionMKLDNNHandler
ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p,
const float eps, const dnnl::engine engine,
platform::Place cpu_place, const Tensor* x,
const Tensor* y, std::vector<int64_t> y_tz,
const dnnl::primitive_attr& attr = NULL)
const Tensor* out, std::vector<int64_t> out_tz,
const dnnl::primitive_attr& attrs = NULL)
Contributor [COMMENT]:
If dnnl::primitive_attr can hold multiple attributes, its name should be changed in oneDNN; if it can't, then our parameter should keep the old name.

Contributor Author:
It can hold scales, post-ops and zero-points, so it definitely supports multiple attributes, but it probably won't be renamed inside oneDNN because of backward compatibility.

: platform::MKLDNNHandlerNoCachingT<T, dnnl::reduction>(engine,
cpu_place) {
PADDLE_ENFORCE_EQ(
x->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument("Wrong layout set for X tensor."));

const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType<T>(),
dnnl::memory::format_tag::any);
const auto out_md = memory::desc(out_tz, platform::MKLDNNGetDataType<T>(),
dnnl::memory::format_tag::any);

if (attr)
this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md,
p, eps);
if (attrs)
this->AcquireForwardPrimitiveDescriptor(attrs, algo, x->mem_desc(),
out_md, p, eps);
else
this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p,
this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), out_md, p,
eps);
}
};
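As the last discussion above notes, a single dnnl::primitive_attr can carry several kinds of attributes at once. A minimal illustration, assuming the oneDNN 2.x C++ API (the same set_scales and post_ops calls that appear elsewhere in this diff):

#include <dnnl.hpp>

int main() {
  dnnl::primitive_attr attrs;

  // Per-argument scales, as used by BinaryMKLDNNHandler above.
  attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {2.0f});
  attrs.set_scales(DNNL_ARG_SRC_1, /* mask = */ 0, {4.0f});

  // A post-op appended after the main computation, e.g. a ReLU.
  dnnl::post_ops ops;
  ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
  attrs.set_post_ops(ops);
  return 0;
}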