Skip to content

Commit

Permalink
src: gpu: intel: jit: conv: add reorder-based precomputed zero points
Browse files Browse the repository at this point in the history
  • Loading branch information
hidefromkgb committed Dec 17, 2024
1 parent 448cb36 commit 2a37b6b
Show file tree
Hide file tree
Showing 31 changed files with 552 additions and 199 deletions.
2 changes: 0 additions & 2 deletions src/common/memory_desc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,6 @@ status_t memory_desc_permute_axes(memory_desc_t &out_memory_desc,
VCHECK_MEMORY(
!memory_desc_wrapper(in_memory_desc).has_runtime_dims_or_strides(),
invalid_arguments, VERBOSE_UNSUPPORTED_MEM_STRIDE);
VCHECK_MEMORY(in_memory_desc.extra.flags == 0, invalid_arguments,
VERBOSE_UNSUPPORTED_MD_FLAG, "extra");

// verify that perm is indeed a permutation of [0 .. ndims)
unsigned occurrence_mask = 0;
Expand Down
25 changes: 24 additions & 1 deletion src/common/memory_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ enum memory_extra_flags_t {
= dnnl_memory_extra_flag_rnn_u8s8_compensation,
dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u,
dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u,
dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src = 32u,
dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd = 64u,
dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap = 128u,
};

// Create aliases for extra flags to preserve the old behavior.
Expand All @@ -87,6 +90,14 @@ const memory_extra_flags_t rnn_s8s8_compensation
= dnnl_memory_extra_flag_rnn_s8s8_compensation;
const memory_extra_flags_t compensation_conv_asymmetric_src
= dnnl_memory_extra_flag_compensation_conv_asymmetric_src;
// *_conv_asymmetric_src output differs significantly on CPU and GPU, so a
// separate GPU flag is needed so that reorders cannot mix the two up
const memory_extra_flags_t compensation_gpu_conv_asymmetric_src
= dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src;
const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_bwd
= dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd;
const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_swap
= dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap;
} // namespace memory_extra_flags

// Generic description of blocked data layout for most memory formats.
Expand Down Expand Up @@ -208,7 +219,11 @@ struct memory_extra_desc_t {
: flags(0)
, compensation_mask(0)
, scale_adjust(0.0f)
, asymm_compensation_mask(0) {}
, asymm_compensation_mask(0)
, idhw {0, 0, 0}
, odhw {0, 0, 0}
, pdhw {0, 0, 0}
, ddhw {0, 0, 0} {}
// The flags contain arbitrary extra information, such as compensation.
// @sa dnnl_memory_extra_flags_t
uint64_t flags;
Expand All @@ -218,6 +233,14 @@ struct memory_extra_desc_t {
float scale_adjust;
// Compensation mask for asymmetric quantization
int asymm_compensation_mask;
// Precomp GPU ZP convolution input spatials
dim_t idhw[3];
// Precomp GPU ZP convolution output spatials
dim_t odhw[3];
// Precomp GPU ZP convolution padding spatials
dim_t pdhw[3];
// Precomp GPU ZP convolution dilation spatials
dim_t ddhw[3];
};

status_t DNNL_API memory_desc_init_by_tag(memory_desc_t &memory_desc, int ndims,
Expand Down
16 changes: 16 additions & 0 deletions src/common/memory_desc_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ struct memory_desc_wrapper : public c_compatible {
if (flag_select & rnn_u8s8_compensation) return sizeof(float);
if (flag_select & compensation_conv_asymmetric_src)
return sizeof(int32_t);
if (flag_select & compensation_gpu_conv_asymmetric_src)
return sizeof(int32_t);
return 0;
}

Expand All @@ -160,6 +162,7 @@ struct memory_desc_wrapper : public c_compatible {
using namespace memory_extra_flags;
return extra().flags
& (compensation_conv_s8s8 | rnn_u8s8_compensation
| compensation_gpu_conv_asymmetric_src
| compensation_conv_asymmetric_src);
}

Expand Down Expand Up @@ -193,6 +196,17 @@ struct memory_desc_wrapper : public c_compatible {
return calculate_size(extra().asymm_compensation_mask,
additional_buffer_data_size(flag));
}
if (flag == compensation_gpu_conv_asymmetric_src) {
dim_t dhw = 1;
for (const auto &o : extra().odhw)
dhw *= std::max(o, dim_t(1));
const int off = (!extra().odhw[1]) ? !extra().odhw[2] + 2
: !extra().odhw[0];
const bool with_groups = (ndims == (6 - off));
return dhw
* calculate_size((with_groups) ? 3 : 1,
additional_buffer_data_size(flag));
}

return 0;
}
Expand All @@ -212,6 +226,8 @@ struct memory_desc_wrapper : public c_compatible {
buff_size += additional_buffer_size(compensation_conv_s8s8);
buff_size += additional_buffer_size(rnn_u8s8_compensation);
buff_size += additional_buffer_size(compensation_conv_asymmetric_src);
buff_size
+= additional_buffer_size(compensation_gpu_conv_asymmetric_src);
return buff_size;
}

Expand Down
8 changes: 8 additions & 0 deletions src/common/primitive_hashing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ size_t get_md_hash(const memory_desc_t &md) {
& dnnl_memory_extra_flag_compensation_conv_asymmetric_src) {
seed = hash_combine(seed, md.extra.asymm_compensation_mask);
}

if (md.extra.flags
& dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) {
seed = get_array_hash(seed, md.extra.idhw, 3);
seed = get_array_hash(seed, md.extra.odhw, 3);
seed = get_array_hash(seed, md.extra.pdhw, 3);
seed = get_array_hash(seed, md.extra.ddhw, 3);
}
}
// Combined hash for a memory descriptor
return seed;
Expand Down
7 changes: 7 additions & 0 deletions src/common/serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,13 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) {
& dnnl_memory_extra_flag_compensation_conv_asymmetric_src) {
sstream.write(&md.extra.asymm_compensation_mask);
}
if (md.extra.flags
& dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) {
sstream.write(md.extra.idhw, 3);
sstream.write(md.extra.odhw, 3);
sstream.write(md.extra.pdhw, 3);
sstream.write(md.extra.ddhw, 3);
}
}
}

Expand Down
7 changes: 6 additions & 1 deletion src/common/type_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,12 @@ inline bool memory_extra_desc_is_equal(
&& IMPLICATION(lhs.flags & scale_adjust,
lhs.scale_adjust == rhs.scale_adjust)
&& IMPLICATION(lhs.flags & compensation_conv_asymmetric_src,
lhs.asymm_compensation_mask == rhs.asymm_compensation_mask);
lhs.asymm_compensation_mask == rhs.asymm_compensation_mask)
&& IMPLICATION(lhs.flags & compensation_gpu_conv_asymmetric_src,
utils::array_cmp(lhs.idhw, rhs.idhw, 3)
&& utils::array_cmp(lhs.odhw, rhs.odhw, 3)
&& utils::array_cmp(lhs.pdhw, rhs.pdhw, 3)
&& utils::array_cmp(lhs.ddhw, rhs.ddhw, 3));
}

inline bool blocking_desc_is_equal(const memory_desc_t &lhs_md,
Expand Down
14 changes: 14 additions & 0 deletions src/common/verbose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,20 @@ std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) {
ss << ":s8m" << extra.compensation_mask;
if (extra.flags & compensation_conv_asymmetric_src)
ss << ":zpm" << extra.asymm_compensation_mask;
if (extra.flags & compensation_gpu_conv_asymmetric_src) {
ss << ":zid" << extra.idhw[0];
ss << ":zih" << extra.idhw[1];
ss << ":ziw" << extra.idhw[2];
ss << ":zod" << extra.odhw[0];
ss << ":zoh" << extra.odhw[1];
ss << ":zow" << extra.odhw[2];
ss << ":zpd" << extra.pdhw[0];
ss << ":zph" << extra.pdhw[1];
ss << ":zpw" << extra.pdhw[2];
ss << ":zdd" << extra.ddhw[0];
ss << ":zdh" << extra.ddhw[1];
ss << ":zdw" << extra.ddhw[2];
}
if (extra.flags & scale_adjust && extra.scale_adjust != 1.f)
ss << ":sa" << extra.scale_adjust;
return ss;
Expand Down
3 changes: 3 additions & 0 deletions src/cpu/reorder/cpu_reorder_pd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ struct cpu_reorder_pd_t : public reorder_pd_t {
post_ops.len() == 1
&& post_ops.entry_[0].kind == primitive_kind::sum);
VDISPATCH_REORDER(args_ok, VERBOSE_UNSUPPORTED_POSTOP);
auto gpu_zp = memory_extra_flags::compensation_gpu_conv_asymmetric_src;
VDISPATCH_REORDER(!(dst_md()->extra.flags & gpu_zp),
VERBOSE_UNSUPPORTED_MD_FLAG, "extra");
return status::success;
}

Expand Down
7 changes: 5 additions & 2 deletions src/gpu/generic/convolution_deconvolution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@ namespace generic {

static status_t weights_axes_permutation(
        memory_desc_t *o_md, const memory_desc_t *i_md, bool with_groups) {
    using namespace memory_extra_flags;
    // Build the identity permutation, then swap the two outermost non-group
    // axes — this maps deconvolution weights onto the convolution layout.
    int axes[DNNL_MAX_NDIMS] {};
    for (int i = 0; i < DNNL_MAX_NDIMS; ++i)
        axes[i] = i;
    const int base = with_groups ? 1 : 0;
    nstl::swap(axes[base], axes[base + 1]);

    CHECK(memory_desc_permute_axes(*o_md, *i_md, axes));
    // The GPU precomputed zero-point compensation is sensitive to this axis
    // swap, so record it in the extra flags for downstream consumers.
    if (o_md->extra.flags & compensation_gpu_conv_asymmetric_src)
        o_md->extra.flags |= compensation_gpu_conv_asymmetric_src_swap;
    return status::success;
}

static status_t conv_descr_create(
Expand Down
50 changes: 31 additions & 19 deletions src/gpu/generic/cross_engine_reorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,24 @@
#include "gpu/generic/cross_engine_reorder.hpp"
#include "gpu/gpu_engine.hpp"
#include "gpu/gpu_stream.hpp"
#include "gpu/gpu_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {

void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *engine) {
using namespace memory_tracking::names;
if (!do_reorder_) return;

auto *gpu_engine = utils::downcast<gpu::engine_t *>(engine);

const memory_desc_wrapper wspace_md(
desc()->src_engine_kind == reorder_engine_kind_ ? dst_md()
: src_md());
auto scratchpad = scratchpad_registry().registrar();
scratchpad.book(memory_tracking::names::key_reorder_cross_space,
wspace_md.size(), 1, gpu_engine->get_buffer_alignment());
scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), 1,
gpu_engine->get_buffer_alignment());
// Books scratchpad storage for the cross-engine reorder: an intermediate
// workspace buffer plus the nested reorder primitive's own scratchpad.
// No-op when no actual reorder is performed (do_reorder_ is false).
void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *gpu_engine) {
    if (do_reorder_) {
        using namespace memory_tracking::names;
        // Buffer alignment required by the GPU engine.
        auto gpu_align = utils::downcast<gpu::engine_t *>(gpu_engine)
                                 ->get_buffer_alignment();
        auto scratchpad = scratchpad_registry().registrar();
        // The workspace mirrors whichever side sits on the reorder engine:
        // dst when the src engine runs the reorder, src otherwise.
        auto needs_dst = desc()->src_engine_kind == reorder_engine_kind_;
        memory_desc_wrapper wspace((needs_dst) ? dst_md() : src_md());
        scratchpad.book(key_reorder_cross_space, wspace.size(), 1, gpu_align);
        scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(),
                1, gpu_align);
    }
}

status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine,
Expand All @@ -50,7 +47,7 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine,
dst_engine->kind()),
VERBOSE_BAD_ENGINE_KIND);
VDISPATCH_REORDER(attr_ok(), VERBOSE_UNSUPPORTED_ATTR);
VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok");
VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok");

memory_desc_wrapper src_mdw(src_md());
memory_desc_wrapper dst_mdw(dst_md());
Expand All @@ -72,17 +69,30 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine,
primitive_attr_t r_attr(*attr());
if (!r_attr.is_initialized()) return status::out_of_memory;

VDISPATCH_REORDER_SC(reorder_primitive_desc_create(reorder_pd_,
reorder_engine, src_md(), dst_md(), &r_attr),
auto clean_src_md = *src_md();
auto clean_dst_md = *dst_md();
clean_src_md.extra = clean_dst_md.extra = {};
VDISPATCH_REORDER_SC(
reorder_primitive_desc_create(reorder_pd_, reorder_engine,
&clean_src_md, &clean_dst_md, &r_attr),
VERBOSE_PRIMITIVE_CREATION_FAIL, "reorder");
init_scratchpad(engine);

reorder_pd_t::init_desc(
src_engine->kind(), dst_engine->kind(), true /* is_cross_engine */);

CHECK(maybe_create_zp_precompute_conv_pd(dst_engine));
init_scratchpad(
(dst_engine->kind() == engine_kind::gpu) ? dst_engine : src_engine);
return status::success;
}

// Creates the optional zero-point precompute convolution and, when a real
// reorder is needed, the nested reorder primitive.
status_t cross_engine_reorder_t::init(impl::engine_t *engine) {
    CHECK(pd()->maybe_create_zp_precompute_conv(
            zp_precomp_conv_, engine, this));
    if (pd()->do_reorder_)
        return create_nested_primitive(reorder_, pd()->reorder_pd_, engine);
    return status::success;
}

status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;
auto *gpu_stream = utils::downcast<gpu::stream_t *>(ctx.stream());
Expand Down Expand Up @@ -158,6 +168,8 @@ status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const {
ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC),
ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST));
}
if (status == status::success)
status = pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_);
}
return status;
}
Expand Down
7 changes: 2 additions & 5 deletions src/gpu/generic/cross_engine_reorder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,13 @@ struct cross_engine_reorder_t : public gpu::primitive_t {
DECLARE_GPU_REORDER_CREATE();
};

status_t init(impl::engine_t *engine) override {
if (!pd()->do_reorder_) return status::success;
return create_nested_primitive(reorder_, pd()->reorder_pd_, engine);
}

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
std::shared_ptr<impl::primitive_t> reorder_;
std::shared_ptr<impl::primitive_t> zp_precomp_conv_;
};

} // namespace generic
Expand Down
Loading

0 comments on commit 2a37b6b

Please sign in to comment.