-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
OneDNN md-in-tensor refactoring part 5: Memory descriptor enabled for elementwises, reductions and expand_v2 ops #43036
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -29,11 +29,11 @@ inline std::vector<int64_t> CalculateReducedDims( | |||||
bool reduce_all, bool keep_dim) { | ||||||
if (keep_dim) return phi::vectorize(output->dims()); | ||||||
|
||||||
if (reduce_all) | ||||||
return std::vector<int64_t>(phi::vectorize(input->dims()).size(), 1); | ||||||
if (reduce_all) return std::vector<int64_t>(input->dims().size(), 1); | ||||||
|
||||||
std::vector<int64_t> output_dims(phi::vectorize(input->dims())); | ||||||
std::vector<int64_t> output_dims = phi::vectorize(input->dims()); | ||||||
for (size_t i = 0; i < reduce_dims.size(); ++i) { | ||||||
// handle negative dims, f.e. -1 means last dimension | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [CONCERN] There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, I'll change it to "rightmost" |
||||||
reduce_dims[i] = (reduce_dims[i] >= 0) | ||||||
? reduce_dims[i] | ||||||
: input->dims().size() + reduce_dims[i]; | ||||||
|
@@ -52,51 +52,52 @@ class ReduceMKLDNNKernel : public framework::OpKernel<T> { | |||||
ctx.template device_context<platform::MKLDNNDeviceContext>(); | ||||||
const auto& onednn_engine = dev_ctx.GetEngine(); | ||||||
|
||||||
const auto* input = ctx.Input<LoDTensor>("X"); | ||||||
auto* output = ctx.Output<Tensor>("Out"); | ||||||
const auto* x = ctx.Input<LoDTensor>("X"); | ||||||
auto* out = ctx.Output<Tensor>("Out"); | ||||||
|
||||||
auto reduce_dims = ctx.Attr<std::vector<int>>("dim"); | ||||||
bool reduce_all = ctx.Attr<bool>("reduce_all"); | ||||||
bool keep_dim = ctx.Attr<bool>("keep_dim"); | ||||||
|
||||||
auto output_dims = | ||||||
CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); | ||||||
auto input_dims = phi::vectorize(input->dims()); | ||||||
auto x_tz = phi::vectorize(x->dims()); | ||||||
auto out_tz = | ||||||
CalculateReducedDims(x, out, reduce_dims, reduce_all, keep_dim); | ||||||
Comment on lines
+55
to
+64
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [OPINION] There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have changed them to unify all of our ops, since all new ops are following this pattern, i.e. inputs are described as 'X' and 'Y' and output is described as 'Out'. Some legacy operators are having different names because of backwards compatibility, but I was thinking that unifying them to get rid of the additional noise would be nice |
||||||
|
||||||
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); | ||||||
|
||||||
// oneDNN reduce op does not support edge case in which memory is being | ||||||
// copied without actual reduction. | ||||||
// In that case reorder must be executed to maintain compatibility with | ||||||
// PaddlePaddle reduce op | ||||||
if (input_dims == output_dims) { | ||||||
dnnl::memory::data_type input_type = framework::ToMKLDNNDataType( | ||||||
framework::TransToProtoVarType(input->dtype())); | ||||||
if (x_tz == out_tz) { | ||||||
dnnl::memory::data_type x_type = framework::ToMKLDNNDataType( | ||||||
framework::TransToProtoVarType(x->dtype())); | ||||||
platform::ReorderMKLDNNHandler reorder_handler( | ||||||
input_dims, framework::TransToProtoVarType(input->dtype()), | ||||||
input_type, onednn_engine); | ||||||
x_tz, framework::TransToProtoVarType(x->dtype()), x_type, | ||||||
onednn_engine); | ||||||
|
||||||
auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( | ||||||
input->mem_desc(), platform::to_void_cast(input->data<T>())); | ||||||
x->mem_desc(), platform::to_void_cast(x->data<T>())); | ||||||
|
||||||
auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( | ||||||
output, input->mem_desc(), ctx.GetPlace()); | ||||||
// reuse same mem desc since it is a simple copy | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
auto reorder_dst_memory_p = | ||||||
reorder_handler.AcquireDstMemory(out, x->mem_desc(), ctx.GetPlace()); | ||||||
|
||||||
auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, | ||||||
reorder_dst_memory_p); | ||||||
|
||||||
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); | ||||||
astream.wait(); | ||||||
|
||||||
output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( | ||||||
phi::vectorize<int64_t>(output->dims()))); | ||||||
out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( | ||||||
phi::vectorize<int64_t>(out->dims()))); | ||||||
} else { | ||||||
platform::ReductionMKLDNNHandler<T> handler(reduction_type, 0.0f, 0.0f, | ||||||
onednn_engine, ctx.GetPlace(), | ||||||
input, output, output_dims); | ||||||
x, out, out_tz); | ||||||
|
||||||
auto src_memory_p = handler.AcquireSrcMemory(input); | ||||||
auto dst_memory_p = handler.AcquireDstMemory(output); | ||||||
auto src_memory_p = handler.AcquireSrcMemory(x); | ||||||
auto dst_memory_p = handler.AcquireDstMemory(out); | ||||||
|
||||||
std::unordered_map<int, dnnl::memory> reduction_args = { | ||||||
{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; | ||||||
|
@@ -105,8 +106,9 @@ class ReduceMKLDNNKernel : public framework::OpKernel<T> { | |||||
|
||||||
reduction_p->execute(astream, reduction_args); | ||||||
astream.wait(); | ||||||
output->set_mem_desc(dst_memory_p->get_desc().reshape( | ||||||
phi::vectorize<int64_t>(output->dims()))); | ||||||
|
||||||
out->set_mem_desc(dst_memory_p->get_desc().reshape( | ||||||
phi::vectorize<int64_t>(out->dims()))); | ||||||
} | ||||||
} | ||||||
}; | ||||||
|
@@ -124,25 +126,18 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel<T> { | |||||
bool keep_dim = ctx.Attr<bool>("keep_dim"); | ||||||
bool reduce_all = ctx.Attr<bool>("reduce_all"); | ||||||
auto dims = ctx.Attr<std::vector<int>>("dim"); | ||||||
const auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); | ||||||
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [CONCERN] There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nope, nice catch, that was a mistake, thanks |
||||||
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); | ||||||
|
||||||
const auto input_dims = | ||||||
CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); | ||||||
const auto output_dims = phi::vectorize(dx->dims()); | ||||||
|
||||||
auto dout_mem_desc = dout->mem_desc(); | ||||||
|
||||||
if (input_dims != output_dims) { | ||||||
dout_mem_desc = dout_mem_desc.reshape(input_dims); | ||||||
} | ||||||
auto dout_tz = CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); | ||||||
auto dx_tz = phi::vectorize(dx->dims()); | ||||||
Comment on lines
+132
to
+133
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [CONCERN] What means There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
|
||||||
platform::BroadcastDataMKLDNNHandler<T> handler( | ||||||
binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y, | ||||||
dout_mem_desc); | ||||||
platform::BroadcastDataMKLDNNHandler<T> handler(binary_type, onednn_engine, | ||||||
ctx.GetPlace(), dout, dx, | ||||||
scale_x, scale_y, dout_tz); | ||||||
|
||||||
const auto src_memory_p = handler.AcquireSrcMemory(dout); | ||||||
const auto dst_memory_p = handler.AcquireDstMemory(dx); | ||||||
const auto dst_memory_p = handler.AcquireZeroedDstMemory(dx); | ||||||
const auto binary_prim = handler.AcquireForwardPrimitive(); | ||||||
|
||||||
const std::unordered_map<int, dnnl::memory> args = { | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -616,29 +616,17 @@ class BinaryMKLDNNHandler | |
public: | ||
BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, | ||
const dnnl::engine engine, platform::Place cpu_place, | ||
const Tensor* x, const Tensor* y, Tensor* z, | ||
float scale_x, float scale_y, float scale_z, | ||
const Tensor* x, const Tensor* y, Tensor* out, | ||
float scale_x, float scale_y, float scale_out, | ||
const dnnl::post_ops& post_ops = dnnl::post_ops{}) | ||
: platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) { | ||
PADDLE_ENFORCE_EQ( | ||
x->layout(), DataLayout::kMKLDNN, | ||
platform::errors::InvalidArgument( | ||
"Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", | ||
DataLayout::kMKLDNN, x->layout())); | ||
|
||
PADDLE_ENFORCE_EQ( | ||
y->layout(), DataLayout::kMKLDNN, | ||
platform::errors::InvalidArgument( | ||
"Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", | ||
DataLayout::kMKLDNN, y->layout())); | ||
|
||
const auto src_x_tz = phi::vectorize(x->dims()); | ||
const auto src_y_tz = phi::vectorize(y->dims()); | ||
// if output tensor(z) is nullptr then we are computing into oneDNN | ||
// managed buffer | ||
auto rankdiff = x->dims().size() - y->dims().size(); | ||
const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) | ||
: phi::vectorize(z->dims()); | ||
const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) | ||
: phi::vectorize(out->dims()); | ||
|
||
auto src0_md = x->mem_desc(); | ||
auto src1_md = y->mem_desc(); | ||
|
@@ -667,7 +655,7 @@ class BinaryMKLDNNHandler | |
MKLDNNMemoryFormat::any); | ||
|
||
auto attributes = | ||
CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); | ||
CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); | ||
|
||
this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, | ||
dst_md); | ||
|
@@ -681,7 +669,7 @@ class BinaryMKLDNNHandler | |
|
||
private: | ||
static inline dnnl::primitive_attr CreateAttributes( | ||
dnnl::algorithm op, float scale_x, float scale_y, float scale_z, | ||
dnnl::algorithm op, float scale_x, float scale_y, float scale_out, | ||
dnnl::post_ops post_ops = dnnl::post_ops{}) { | ||
// Scales set in attributes for inputs contibute to the output equation | ||
// in the following way (assuming no broadcasting takes place): | ||
|
@@ -699,9 +687,9 @@ class BinaryMKLDNNHandler | |
// For mul operation on the other hand | ||
// output = (scale_out / scale_x) * x * (1.0 / scale_y) * y | ||
// <scale_0> <scale_1> | ||
float scale_0 = scale_z / scale_x; | ||
float scale_0 = scale_out / scale_x; | ||
float scale_1 = | ||
op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y; | ||
op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y; | ||
dnnl::primitive_attr attributes; | ||
attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, | ||
{scale_0}); | ||
|
@@ -718,34 +706,28 @@ class BroadcastDataMKLDNNHandler | |
public: | ||
BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, | ||
const dnnl::engine engine, | ||
platform::Place cpu_place, const Tensor* out, | ||
const Tensor* x, float scale_x, float scale_y, | ||
const dnnl::memory::desc& x_mem_desc) | ||
platform::Place cpu_place, const Tensor* x, | ||
Tensor* out, float scale_x, float scale_y, | ||
const std::vector<int64_t>& extended_x_dims) | ||
Comment on lines
+709
to
+711
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [CONCERN] Was this change of order necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was not necessary, but for me order: input, output is more intuitive than output, input. I have checked all the tests for reduce_grad and expand_v2 and they all work correctly after my changes |
||
: platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) { | ||
PADDLE_ENFORCE_EQ( | ||
x->layout(), DataLayout::kMKLDNN, | ||
platform::errors::InvalidArgument("Wrong layout set for X tensor.")); | ||
|
||
const auto src0_tz = phi::vectorize(out->dims()); | ||
|
||
const auto src0_md = | ||
dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType<T>(), | ||
platform::GetPlainMKLDNNFormat(src0_tz.size())); | ||
const auto src1_md = x->mem_desc().reshape(extended_x_dims); | ||
|
||
const auto src1_md = x_mem_desc; | ||
|
||
dnnl::primitive_attr attributes; | ||
attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); | ||
attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); | ||
dnnl::primitive_attr attrs; | ||
attrs.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); | ||
attrs.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [OPINION] |
||
|
||
this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, | ||
this->AcquireForwardPrimitiveDescriptor(attrs, algo, src0_md, src1_md, | ||
src0_md); | ||
} | ||
|
||
template <typename T_out = T> | ||
std::shared_ptr<dnnl::memory> AcquireDstMemory(framework::Tensor* output) { | ||
T_out* ptr = output->mutable_data<T_out>( | ||
this->place_, this->fwd_pd_->dst_desc().get_size()); | ||
std::shared_ptr<dnnl::memory> AcquireZeroedDstMemory(framework::Tensor* out) { | ||
T_out* ptr = out->mutable_data<T_out>(this->place_, | ||
this->fwd_pd_->dst_desc().get_size()); | ||
memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); | ||
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); | ||
} | ||
|
@@ -758,22 +740,18 @@ class ReductionMKLDNNHandler | |
ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, | ||
const float eps, const dnnl::engine engine, | ||
platform::Place cpu_place, const Tensor* x, | ||
const Tensor* y, std::vector<int64_t> y_tz, | ||
const dnnl::primitive_attr& attr = NULL) | ||
const Tensor* out, std::vector<int64_t> out_tz, | ||
const dnnl::primitive_attr& attrs = NULL) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [COMMENT] There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It can hold scales, post-ops, zero-points, so it definitely supports multiple attributes, but probably because of backward compatibility it won't be changed inside oneDNN |
||
: platform::MKLDNNHandlerNoCachingT<T, dnnl::reduction>(engine, | ||
cpu_place) { | ||
PADDLE_ENFORCE_EQ( | ||
x->layout(), DataLayout::kMKLDNN, | ||
platform::errors::InvalidArgument("Wrong layout set for X tensor.")); | ||
|
||
const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType<T>(), | ||
dnnl::memory::format_tag::any); | ||
const auto out_md = memory::desc(out_tz, platform::MKLDNNGetDataType<T>(), | ||
dnnl::memory::format_tag::any); | ||
|
||
if (attr) | ||
this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md, | ||
p, eps); | ||
if (attrs) | ||
this->AcquireForwardPrimitiveDescriptor(attrs, algo, x->mem_desc(), | ||
out_md, p, eps); | ||
else | ||
this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p, | ||
this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), out_md, p, | ||
eps); | ||
} | ||
}; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[QUESTION]
Was this change necessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, it will produce the exact same assembly, I will revert it then