From 0993edf7df99e89d89b522a983336a78fda5c402 Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Thu, 12 Dec 2024 15:53:16 -0800 Subject: [PATCH 1/2] src: common: make rnn_s8s8_compensation a power of 2 --- src/common/memory_desc.hpp | 14 ++++++------- src/common/memory_desc_wrapper.hpp | 28 +++++++++----------------- src/common/primitive_hashing.cpp | 8 +++----- src/common/serialization.cpp | 10 +++------ src/common/type_helpers.hpp | 19 +++-------------- src/cpu/rnn/rnn_reorders.hpp | 11 ++-------- src/gpu/intel/ocl/rnn/rnn_reorders.hpp | 12 +++-------- 7 files changed, 31 insertions(+), 71 deletions(-) diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index 468a8528ec2..5dc820c67c1 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ const rnn_packed_memory_format_t ldio_p = rnn_packed_memory_format_t::ldio_p; // TODO: convert to 'enum class'. // Flags for memory special features enum memory_extra_flags_t { - dnnl_memory_extra_flag_none = 0x0U, + dnnl_memory_extra_flag_none = 0u, // Indicates the weights have an additional buffer, that depends on the // @p compensation_mask. // @@ -64,13 +64,13 @@ enum memory_extra_flags_t { // the additional buffer would consist of OC values: // O[oc : 0,OC] = // -128 * SUM(ic : 0,IC; kh : 0,KH; kw : 0,KW){ weights(oc, ic, kh, kw) } - dnnl_memory_extra_flag_compensation_conv_s8s8 = 0x1U, - dnnl_memory_extra_flag_scale_adjust = 0x2U, - dnnl_memory_extra_flag_rnn_u8s8_compensation = 0x4U, + dnnl_memory_extra_flag_compensation_conv_s8s8 = 1u, + dnnl_memory_extra_flag_scale_adjust = 2u, + dnnl_memory_extra_flag_rnn_u8s8_compensation = 4u, dnnl_memory_extra_flag_gpu_rnn_u8s8_compensation = dnnl_memory_extra_flag_rnn_u8s8_compensation, - dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 0x8U, - dnnl_memory_extra_flag_rnn_s8s8_compensation = 0x16U, + dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, + dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, }; // Create aliases for extra flags to preserve the old behavior. diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 5cf2e2f66ba..847951ba558 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -149,9 +149,7 @@ struct memory_desc_wrapper : public c_compatible { size_t additional_buffer_data_size(uint64_t flag_select) const { using namespace memory_extra_flags; if (flag_select & compensation_conv_s8s8) return sizeof(int32_t); - if ((flag_select & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set(flag_select)) - return sizeof(float); + if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); return 0; @@ -160,19 +158,16 @@ struct memory_desc_wrapper : public c_compatible { /** return true if memory format has additional buffer */ bool is_additional_buffer() const { using namespace memory_extra_flags; - // Currently compensation is not required for rnn_s8s8_compensation, - // but it has common bit with rnn_u8s8_compensation constant so we have - // to exclude rnn_s8s8_compensation case explicitly - return ((extra().flags - & (compensation_conv_s8s8 | rnn_u8s8_compensation - | compensation_conv_asymmetric_src)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)); + return extra().flags + & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_conv_asymmetric_src); } /** returns the size required for a particular extra memory buffer */ size_t additional_buffer_size(memory_extra_flags_t flag) const { using namespace memory_extra_flags; + const auto flags = extra().flags; + if (!(flags & flag)) return 0; const auto ndims = this->ndims(); const auto &pdims = padded_dims(); @@ -186,18 +181,15 @@ struct memory_desc_wrapper : public c_compatible { return (size_t)prod * buff_data_size; }; - if (extra().flags & compensation_conv_s8s8) { + if (flag == compensation_conv_s8s8) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - - if ((extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)) { + if (flag == rnn_u8s8_compensation) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - if (extra().flags & compensation_conv_asymmetric_src) { + if (flag == compensation_conv_asymmetric_src) { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index 7c51d4d5de3..a8d9f25ce8c 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -190,11 +190,9 @@ size_t get_md_hash(const memory_desc_t &md) { if (md.extra.flags != dnnl_memory_extra_flag_none) { seed = hash_combine(seed, md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | dnnl_memory_extra_flag_rnn_u8s8_compensation)) { seed = hash_combine(seed, md.extra.compensation_mask); } diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index fe43c2e2efc..8e40dd29819 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -120,18 +120,14 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { if (md.extra.flags != dnnl_memory_extra_flag_none) { sstream.write(&md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && 
!types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | dnnl_memory_extra_flag_rnn_u8s8_compensation)) { sstream.write(&md.extra.compensation_mask); } - if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) { sstream.write(&md.extra.scale_adjust); } - if (md.extra.flags & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { sstream.write(&md.extra.asymm_compensation_mask); diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index ef617b4731d..7a6efb9d986 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -299,28 +299,15 @@ inline format_kind_t format_tag_to_kind(format_tag_t tag) { return format_kind::undef; } -// Currently rnn_s8s8_compensation has common bits with rnn_u8s8_compensation -// and scale_adjust constants so we have to perform additional checks to -// separate these two cases -inline bool extra_flag_rnn_s8s8_compensation_is_set(uint64_t flags) { - return ((flags & memory_extra_flags::rnn_s8s8_compensation) - ^ memory_extra_flags::rnn_s8s8_compensation) - == 0; -} - inline bool memory_extra_desc_is_equal( const memory_extra_desc_t &lhs, const memory_extra_desc_t &rhs) { using namespace memory_extra_flags; - return true && lhs.flags == rhs.flags + return lhs.flags == rhs.flags && IMPLICATION(lhs.flags & compensation_conv_s8s8, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & rnn_u8s8_compensation) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & rnn_u8s8_compensation, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & scale_adjust) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); diff --git a/src/cpu/rnn/rnn_reorders.hpp b/src/cpu/rnn/rnn_reorders.hpp index 5156350d860..e96828d369c 100644 --- a/src/cpu/rnn/rnn_reorders.hpp +++ b/src/cpu/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -779,12 +779,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { return unimplemented; // Check the proper memory desc has been passed to u8s8 and s8s8 - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases const bool check_u8s8 = (od.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - od.extra().flags) && od.extra().compensation_mask == ((id.ndims() == 5) ? 27 /* 11011 */ : 13 /* 1101 */); @@ -886,9 +881,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { .template get(memory_tracking::names:: key_reorder_rnn_weights_reduction); float *comp = reinterpret_cast(dst + compensation_offset); - const bool req_s8s8_comp = (dst_d.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - dst_d.extra().flags); + const bool req_s8s8_comp = dst_d.extra().flags & rnn_u8s8_compensation; const auto mask_ok = [&](int mask) { return mask == ((src_d.ndims() == 5) ? 
27 /* 11011 */ diff --git a/src/gpu/intel/ocl/rnn/rnn_reorders.hpp b/src/gpu/intel/ocl/rnn/rnn_reorders.hpp index 80f1ed4c0b3..5b72142ce0a 100644 --- a/src/gpu/intel/ocl/rnn/rnn_reorders.hpp +++ b/src/gpu/intel/ocl/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,14 +42,8 @@ struct rnn_weights_reorder_t : public gpu_primitive_t { status_t init(impl::engine_t *engine, impl::engine_t *src_engine, impl::engine_t *dst_engine) { - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases - VDISPATCH_REORDER( - !IMPLICATION(dst_md()->extra.flags - & memory_extra_flags::rnn_u8s8_compensation, - types::extra_flag_rnn_s8s8_compensation_is_set( - dst_md()->extra.flags)), + VDISPATCH_REORDER(dst_md()->extra.flags + & memory_extra_flags::rnn_u8s8_compensation, VERBOSE_BAD_FLAGS); VDISPATCH_REORDER(utils::one_of(src_engine->kind(), From ce8008183a8c7ebd41f1edf4c418ffd96af5074d Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Wed, 8 Jan 2025 23:30:10 -0800 Subject: [PATCH 2/2] src: gpu: intel: jit: conv: add reorder-based precomputed zero points --- src/common/memory_desc.cpp | 7 +- src/common/memory_desc.hpp | 41 ++++- src/common/memory_desc_wrapper.hpp | 8 + src/common/primitive_hashing.cpp | 9 + src/common/serialization.cpp | 8 + src/common/type_helpers.hpp | 8 +- src/common/verbose.cpp | 15 ++ src/cpu/reorder/cpu_reorder_pd.hpp | 5 +- src/gpu/generic/convolution_deconvolution.hpp | 9 +- src/gpu/generic/cross_engine_reorder.cpp | 52 ++++-- src/gpu/generic/cross_engine_reorder.hpp | 9 +- src/gpu/gpu_reorder_pd.cpp | 101 +++++++++++ src/gpu/gpu_reorder_pd.hpp | 32 +++- src/gpu/gpu_utils.hpp | 2 +- src/gpu/gpu_zero_points_conv.cpp | 96 +++++++++++ src/gpu/gpu_zero_points_conv.hpp | 36 ++++ src/gpu/intel/jit/codegen/kernel.hpp | 57 ++++-- src/gpu/intel/jit/codegen/reorder.hpp | 12 +- src/gpu/intel/jit/conv/config.cpp | 77 +++++++++ src/gpu/intel/jit/conv/config.hpp | 4 +- src/gpu/intel/jit/conv/gen_convolution.cpp | 163 ++++++++---------- src/gpu/intel/jit/conv/normalization.cpp | 27 +-- src/gpu/intel/jit/conv/normalization.hpp | 5 +- src/gpu/intel/jit/conv/zp_plan.cpp | 12 +- src/gpu/intel/jit/conv/zp_plan.hpp | 3 +- src/gpu/intel/jit/ir/epilogue.cpp | 3 + src/gpu/intel/jit/ir/kernel_info.hpp | 6 +- src/gpu/intel/jit/ir/post_ops.cpp | 25 ++- src/gpu/intel/jit/ir/post_ops.hpp | 29 +++- src/gpu/intel/jit/ir/tensor_config.cpp | 10 +- src/gpu/intel/jit/reorder/gen_reorder.cpp | 12 +- src/gpu/intel/jit/reorder/gen_reorder.hpp | 3 +- src/gpu/intel/ocl/ref_reorder.cpp | 26 +-- src/gpu/intel/ocl/ref_reorder.hpp | 10 +- 34 files changed, 703 insertions(+), 219 deletions(-) create mode 100644 src/gpu/gpu_reorder_pd.cpp create mode 100644 src/gpu/gpu_zero_points_conv.cpp create mode 100644 src/gpu/gpu_zero_points_conv.hpp diff --git a/src/common/memory_desc.cpp b/src/common/memory_desc.cpp index f9345a72302..5d5a0958b52 100644 --- a/src/common/memory_desc.cpp +++ b/src/common/memory_desc.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 
(the "License"); * you may not use this file except in compliance with the License. @@ -471,8 +471,9 @@ status_t memory_desc_permute_axes(memory_desc_t &out_memory_desc, VCHECK_MEMORY( !memory_desc_wrapper(in_memory_desc).has_runtime_dims_or_strides(), invalid_arguments, VERBOSE_UNSUPPORTED_MEM_STRIDE); - VCHECK_MEMORY(in_memory_desc.extra.flags == 0, invalid_arguments, - VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); + VCHECK_MEMORY( + check_md_extra_flags_compensation_gpu(in_memory_desc.extra.flags), + invalid_arguments, VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); // verify that perm is indeed a permutation of [0 .. ndims) unsigned occurrence_mask = 0; diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index 5dc820c67c1..3b9dd8d0b1e 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -71,6 +71,15 @@ enum memory_extra_flags_t { = dnnl_memory_extra_flag_rnn_u8s8_compensation, dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, + // This flag has to be kept separate from *compensation_conv_asymmetric_src + // since the GPU precompute algorithm is incompatible with that of the CPU + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src = 32u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when precompute is to be performed for a backward-by-data convolution + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd = 64u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when IC and OC are swapped to reinterpret a deconv as a BWD_D conv + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap = 128u, }; // Create aliases for extra flags to preserve the old behavior. @@ -87,8 +96,23 @@ const memory_extra_flags_t rnn_s8s8_compensation = dnnl_memory_extra_flag_rnn_s8s8_compensation; const memory_extra_flags_t compensation_conv_asymmetric_src = dnnl_memory_extra_flag_compensation_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_bwd + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_swap + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap; } // namespace memory_extra_flags +inline bool check_md_extra_flags_compensation_gpu(uint64_t flags) { + using namespace memory_extra_flags; + const uint64_t c = compensation_gpu_conv_asymmetric_src; + const uint64_t b = compensation_gpu_conv_asymmetric_src_bwd; + const uint64_t s = compensation_gpu_conv_asymmetric_src_swap; + return (flags == none) || (flags == c) || (flags == (c | b)) + || (flags == (c | b | s)); +} + // Generic description of blocked data layout for most memory formats. struct blocking_desc_t { // The strides between the outermost blocks. @@ -208,7 +232,12 @@ struct memory_extra_desc_t { : flags(0) , compensation_mask(0) , scale_adjust(0.0f) - , asymm_compensation_mask(0) {} + , asymm_compensation_mask(0) + , idhw {0, 0, 0} + , odhw {0, 0, 0} + , pdhw {0, 0, 0} + , ddhw {0, 0, 0} + , dst_size(0) {} // The flags contain arbitrary extra information, such as compensation. 
// @sa dnnl_memory_extra_flags_t uint64_t flags; @@ -218,6 +247,16 @@ struct memory_extra_desc_t { float scale_adjust; // Compensation mask for asymmetric quantization int asymm_compensation_mask; + // Precomp GPU ZP convolution input spatials + dim_t idhw[3]; + // Precomp GPU ZP convolution output spatials + dim_t odhw[3]; + // Precomp GPU ZP convolution padding spatials + dim_t pdhw[3]; + // Precomp GPU ZP convolution dilation spatials + dim_t ddhw[3]; + // Precomp GPU ZP convolution destination size + dim_t dst_size; }; status_t DNNL_API memory_desc_init_by_tag(memory_desc_t &memory_desc, int ndims, diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 847951ba558..9b32468975b 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -152,6 +152,8 @@ struct memory_desc_wrapper : public c_compatible { if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); + if (flag_select & compensation_gpu_conv_asymmetric_src) + return sizeof(int32_t); return 0; } @@ -160,6 +162,7 @@ struct memory_desc_wrapper : public c_compatible { using namespace memory_extra_flags; return extra().flags & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_gpu_conv_asymmetric_src | compensation_conv_asymmetric_src); } @@ -193,6 +196,9 @@ struct memory_desc_wrapper : public c_compatible { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } + if (flag == compensation_gpu_conv_asymmetric_src) { + return extra().dst_size; + } return 0; } @@ -212,6 +218,8 @@ struct memory_desc_wrapper : public c_compatible { buff_size += additional_buffer_size(compensation_conv_s8s8); buff_size += additional_buffer_size(rnn_u8s8_compensation); buff_size += additional_buffer_size(compensation_conv_asymmetric_src); + buff_size + += additional_buffer_size(compensation_gpu_conv_asymmetric_src); return buff_size; } diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index a8d9f25ce8c..a7a0f9ed295 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -204,6 +204,15 @@ size_t get_md_hash(const memory_desc_t &md) { & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { seed = hash_combine(seed, md.extra.asymm_compensation_mask); } + + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + seed = get_array_hash(seed, md.extra.idhw, 3); + seed = get_array_hash(seed, md.extra.odhw, 3); + seed = get_array_hash(seed, md.extra.pdhw, 3); + seed = get_array_hash(seed, md.extra.ddhw, 3); + seed = hash_combine(seed, md.extra.dst_size); + } } // Combined hash for a memory descriptor return seed; diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index 8e40dd29819..afe9c37f49e 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -132,6 +132,14 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { sstream.write(&md.extra.asymm_compensation_mask); } + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + sstream.write(md.extra.idhw, 3); + sstream.write(md.extra.odhw, 3); + sstream.write(md.extra.pdhw, 3); + sstream.write(md.extra.ddhw, 3); + sstream.write(&md.extra.dst_size); + } } } diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index 7a6efb9d986..c8abbbe4364 
100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -310,7 +310,13 @@ inline bool memory_extra_desc_is_equal( && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, - lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); + lhs.asymm_compensation_mask == rhs.asymm_compensation_mask) + && IMPLICATION(lhs.flags & compensation_gpu_conv_asymmetric_src, + (lhs.dst_size == rhs.dst_size) + && utils::array_cmp(lhs.idhw, rhs.idhw, 3) + && utils::array_cmp(lhs.odhw, rhs.odhw, 3) + && utils::array_cmp(lhs.pdhw, rhs.pdhw, 3) + && utils::array_cmp(lhs.ddhw, rhs.ddhw, 3)); } inline bool blocking_desc_is_equal(const memory_desc_t &lhs_md, diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp index 63a9df9a2e1..76dfb31f46b 100644 --- a/src/common/verbose.cpp +++ b/src/common/verbose.cpp @@ -414,6 +414,21 @@ std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) { ss << ":s8m" << extra.compensation_mask; if (extra.flags & compensation_conv_asymmetric_src) ss << ":zpm" << extra.asymm_compensation_mask; + if (extra.flags & compensation_gpu_conv_asymmetric_src) { + ss << ":zid" << extra.idhw[0]; + ss << ":zih" << extra.idhw[1]; + ss << ":ziw" << extra.idhw[2]; + ss << ":zod" << extra.odhw[0]; + ss << ":zoh" << extra.odhw[1]; + ss << ":zow" << extra.odhw[2]; + ss << ":zpd" << extra.pdhw[0]; + ss << ":zph" << extra.pdhw[1]; + ss << ":zpw" << extra.pdhw[2]; + ss << ":zdd" << extra.ddhw[0]; + ss << ":zdh" << extra.ddhw[1]; + ss << ":zdw" << extra.ddhw[2]; + ss << ":zs" << extra.dst_size; + } if (extra.flags & scale_adjust && extra.scale_adjust != 1.f) ss << ":sa" << extra.scale_adjust; return ss; diff --git a/src/cpu/reorder/cpu_reorder_pd.hpp b/src/cpu/reorder/cpu_reorder_pd.hpp index d1c8499c151..ca69992b0fe 100644 --- a/src/cpu/reorder/cpu_reorder_pd.hpp +++ b/src/cpu/reorder/cpu_reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,9 @@ struct cpu_reorder_pd_t : public reorder_pd_t { post_ops.len() == 1 && post_ops.entry_[0].kind == primitive_kind::sum); VDISPATCH_REORDER(args_ok, VERBOSE_UNSUPPORTED_POSTOP); + auto gpu_zp = memory_extra_flags::compensation_gpu_conv_asymmetric_src; + VDISPATCH_REORDER(!(dst_md()->extra.flags & gpu_zp), + VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); return status::success; } diff --git a/src/gpu/generic/convolution_deconvolution.hpp b/src/gpu/generic/convolution_deconvolution.hpp index 74893d4c5db..1c07d94522d 100644 --- a/src/gpu/generic/convolution_deconvolution.hpp +++ b/src/gpu/generic/convolution_deconvolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,12 +32,15 @@ namespace generic { static status_t weights_axes_permutation( memory_desc_t *o_md, const memory_desc_t *i_md, bool with_groups) { + using namespace memory_extra_flags; int perm[DNNL_MAX_NDIMS] {}; // deconv to conv weight permutation for (int d = 0; d < DNNL_MAX_NDIMS; ++d) perm[d] = d; nstl::swap(perm[0 + with_groups], perm[1 + with_groups]); - - return memory_desc_permute_axes(*o_md, *i_md, perm); + CHECK(memory_desc_permute_axes(*o_md, *i_md, perm)); + if (o_md->extra.flags & compensation_gpu_conv_asymmetric_src) + o_md->extra.flags |= compensation_gpu_conv_asymmetric_src_swap; + return status::success; } static status_t conv_descr_create( diff --git a/src/gpu/generic/cross_engine_reorder.cpp b/src/gpu/generic/cross_engine_reorder.cpp index 6ded618a9c9..cbf4672c4c6 100644 --- a/src/gpu/generic/cross_engine_reorder.cpp +++ b/src/gpu/generic/cross_engine_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,20 +27,18 @@ namespace impl { namespace gpu { namespace generic { -void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *engine) { - using namespace memory_tracking::names; - if (!do_reorder_) return; - - auto *gpu_engine = utils::downcast(engine); - - const memory_desc_wrapper wspace_md( - desc()->src_engine_kind == reorder_engine_kind_ ? dst_md() - : src_md()); - auto scratchpad = scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_reorder_cross_space, - wspace_md.size(), 1, gpu_engine->get_buffer_alignment()); - scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), 1, - gpu_engine->get_buffer_alignment()); +void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *gpu_engine) { + if (do_reorder_) { + using namespace memory_tracking::names; + auto gpu_align = utils::downcast(gpu_engine) + ->get_buffer_alignment(); + auto scratchpad = scratchpad_registry().registrar(); + auto needs_dst = desc()->src_engine_kind == reorder_engine_kind_; + memory_desc_wrapper wspace((needs_dst) ? 
dst_md() : src_md()); + scratchpad.book(key_reorder_cross_space, wspace.size(), 1, gpu_align); + scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), + 1, gpu_align); + } } status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, @@ -50,7 +48,7 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, dst_engine->kind()), VERBOSE_BAD_ENGINE_KIND); VDISPATCH_REORDER(attr_ok(), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); + VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); memory_desc_wrapper src_mdw(src_md()); memory_desc_wrapper dst_mdw(dst_md()); @@ -72,17 +70,31 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, primitive_attr_t r_attr(*attr()); if (!r_attr.is_initialized()) return status::out_of_memory; - VDISPATCH_REORDER_SC(reorder_primitive_desc_create(reorder_pd_, - reorder_engine, src_md(), dst_md(), &r_attr), + auto clean_src_md = *src_md(); + auto clean_dst_md = *dst_md(); + clean_src_md.extra = clean_dst_md.extra = {}; + VDISPATCH_REORDER_SC( + reorder_primitive_desc_create(reorder_pd_, reorder_engine, + &clean_src_md, &clean_dst_md, &r_attr), VERBOSE_PRIMITIVE_CREATION_FAIL, "reorder"); - init_scratchpad(engine); reorder_pd_t::init_desc( src_engine->kind(), dst_engine->kind(), true /* is_cross_engine */); + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); + init_scratchpad( + (dst_engine->kind() == engine_kind::gpu) ? dst_engine : src_engine); return status::success; } +status_t cross_engine_reorder_t::init(impl::engine_t *engine) { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + if (!pd()->do_reorder_) return status::success; + return create_nested_primitive(reorder_, pd()->reorder_pd_, engine); +} + status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const { using namespace memory_tracking::names; auto *gpu_stream = utils::downcast(ctx.stream()); @@ -158,6 +170,8 @@ status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const { ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC), ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)); } + if (status == status::success) + status = pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_); } return status; } diff --git a/src/gpu/generic/cross_engine_reorder.hpp b/src/gpu/generic/cross_engine_reorder.hpp index cd69fefefaf..c6557ddaaeb 100644 --- a/src/gpu/generic/cross_engine_reorder.hpp +++ b/src/gpu/generic/cross_engine_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,16 +57,13 @@ struct cross_engine_reorder_t : public gpu::primitive_t { DECLARE_GPU_REORDER_CREATE(); }; - status_t init(impl::engine_t *engine) override { - if (!pd()->do_reorder_) return status::success; - return create_nested_primitive(reorder_, pd()->reorder_pd_, engine); - } - + status_t init(impl::engine_t *engine) override; status_t execute(const exec_ctx_t &ctx) const override; private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::shared_ptr reorder_; + std::shared_ptr zp_precomp_conv_; }; } // namespace generic diff --git a/src/gpu/gpu_reorder_pd.cpp b/src/gpu/gpu_reorder_pd.cpp new file mode 100644 index 00000000000..ca293db5c89 --- /dev/null +++ b/src/gpu/gpu_reorder_pd.cpp @@ -0,0 +1,101 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/gpu_reorder_pd.hpp" +#include "gpu/gpu_engine.hpp" +#include "gpu/gpu_stream.hpp" +#include "gpu/gpu_zero_points_conv.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t gpu_reorder_pd_t::maybe_create_zp_precompute_conv_pd( + impl::engine_t *dst_engine) { + memory_desc_wrapper dst_mdw(dst_md()); + auto &extra = dst_mdw.extra(); + auto needs_conv = memory_extra_flags::compensation_gpu_conv_asymmetric_src; + auto is_dst_gpu = (dst_engine->kind() == engine_kind::gpu); + do_zp_precomp_conv_ = is_dst_gpu && (extra.flags & needs_conv); + if (!do_zp_precomp_conv_) return status::success; + + using namespace memory_extra_flags; + const auto out_type = data_type::f32; + primitive_attr_t attr; + const bool is_bwd_d + = extra.flags & compensation_gpu_conv_asymmetric_src_bwd; + auto prop = (is_bwd_d) ? prop_kind::backward_data + : prop_kind::forward_inference; + CHECK(create_zp_precompute_conv_pd(zp_precomp_conv_pd_, dst_engine, attr, + dst_md(), extra.idhw, extra.odhw, extra.pdhw, extra.ddhw, out_type, + prop)); + + using namespace memory_tracking::names; + auto gpu_align = utils::downcast(dst_engine) + ->get_buffer_alignment(); + auto scratchpad = scratchpad_registry().registrar(); + auto registry = zp_precomp_conv_pd_->scratchpad_registry(); + memory_desc_wrapper wspace((is_bwd_d) ? 
zp_precomp_conv_pd_->diff_dst_md() + : zp_precomp_conv_pd_->src_md()); + scratchpad.book(key_conv_tr_src, wspace.size(), 1, gpu_align); + scratchpad.book(key_conv_tails, registry.size(), 1, gpu_align); + return status::success; +} + +status_t gpu_reorder_pd_t::maybe_create_zp_precompute_conv( + std::shared_ptr &zp_precomp_conv, + impl::engine_t *engine, gpu::primitive_t *primitive) const { + if (!do_zp_precomp_conv_) return status::success; + return primitive->create_nested_primitive( + zp_precomp_conv, zp_precomp_conv_pd_, engine); +} + +status_t gpu_reorder_pd_t::maybe_exec_zp_precompute_conv(const exec_ctx_t &ctx, + const std::shared_ptr &zp_precomp_conv) const { + using namespace memory_tracking::names; + if (!do_zp_precomp_conv_) return status::success; + + const bool is_bwd_d = (zp_precomp_conv_pd_->get_prop_kind() + == prop_kind::backward_data); + auto *gpu_stream = utils::downcast(ctx.stream()); + auto conv_md_in = (is_bwd_d) ? zp_precomp_conv_pd_->diff_dst_md() + : zp_precomp_conv_pd_->src_md(); + auto scratchpad + = ctx.get_scratchpad_grantor().get_memory_storage(key_conv_tr_src); + std::unique_ptr wspace; + CHECK(safe_ptr_assign(wspace, + new memory_t(ctx.stream()->engine(), conv_md_in, + std::move(scratchpad)))); + CHECK(gpu_stream->fill(*wspace->memory_storage(), 0x01, + memory_desc_wrapper(conv_md_in).size(), + gpu_stream->ctx().get_deps(), gpu_stream->ctx().get_deps())); + + exec_args_t r_args; + auto arg_in = (is_bwd_d) ? DNNL_ARG_DIFF_DST : DNNL_ARG_SRC; + auto arg_out = (is_bwd_d) ? DNNL_ARG_DIFF_SRC : DNNL_ARG_DST; + r_args[arg_in] = memory_arg_t {(memory_t *)wspace.get(), true}; + r_args[DNNL_ARG_WEIGHTS] = memory_arg_t {ctx.output(DNNL_ARG_TO), true}; + r_args[arg_out] = memory_arg_t {ctx.output(DNNL_ARG_TO), false}; + exec_ctx_t r_ctx(ctx, std::move(r_args)); + + nested_scratchpad_t ns(ctx, key_conv_tails, zp_precomp_conv); + r_ctx.set_scratchpad_grantor(ns.grantor()); + return zp_precomp_conv->execute(r_ctx); +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/gpu_reorder_pd.hpp b/src/gpu/gpu_reorder_pd.hpp index d70c28bdd81..71617d96dc8 100644 --- a/src/gpu/gpu_reorder_pd.hpp +++ b/src/gpu/gpu_reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #define GPU_GPU_REORDER_PD_HPP #include "common/reorder_pd.hpp" +#include "gpu/gpu_primitive.hpp" namespace dnnl { namespace impl { @@ -28,10 +29,9 @@ struct gpu_reorder_pd_t : public reorder_pd_t { protected: bool attr_ok() const { - return attr()->has_default_values( - dnnl_primitive_attr::skip_mask_t::zero_points_runtime - | dnnl_primitive_attr::skip_mask_t::scales_runtime - | dnnl_primitive_attr::skip_mask_t::post_ops) + using sm = dnnl_primitive_attr::skip_mask_t; + return attr()->has_default_values(sm::zero_points_runtime + | sm::scales_runtime | sm::post_ops) && post_ops_ok() && zero_points_ok(); } @@ -62,9 +62,27 @@ struct gpu_reorder_pd_t : public reorder_pd_t { && post_ops.entry_[0].kind == primitive_kind::sum); } - bool extra_ok() const { - return src_md()->extra.flags == 0 && dst_md()->extra.flags == 0; + bool extra_ok(bool accept_conv_asymm = false) const { + if (!accept_conv_asymm) + return (src_md()->extra.flags == memory_extra_flags::none) + && (dst_md()->extra.flags == memory_extra_flags::none); + return check_md_extra_flags_compensation_gpu(src_md()->extra.flags) + && check_md_extra_flags_compensation_gpu(dst_md()->extra.flags); } + + status_t maybe_create_zp_precompute_conv_pd(impl::engine_t *dst_engine); + +public: + status_t maybe_create_zp_precompute_conv( + std::shared_ptr &zp_precomp_conv, + impl::engine_t *engine, gpu::primitive_t *primitive) const; + + status_t maybe_exec_zp_precompute_conv(const exec_ctx_t &ctx, + const std::shared_ptr &zp_precomp_conv) const; + +private: + bool do_zp_precomp_conv_ = false; + std::shared_ptr zp_precomp_conv_pd_; }; } // namespace gpu diff --git a/src/gpu/gpu_utils.hpp b/src/gpu/gpu_utils.hpp index 18c82b1dccc..fe56ccaba41 100644 --- a/src/gpu/gpu_utils.hpp +++ b/src/gpu/gpu_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/gpu/gpu_zero_points_conv.cpp b/src/gpu/gpu_zero_points_conv.cpp new file mode 100644 index 00000000000..0e1edb567d9 --- /dev/null +++ b/src/gpu/gpu_zero_points_conv.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include + +#include "common/convolution_pd.hpp" +#include "common/primitive_desc_iterator.hpp" +#include "gpu/gpu_zero_points_conv.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t create_zp_precompute_conv_pd(std::shared_ptr &retn, + dnnl::impl::engine_t *eng, const primitive_attr_t &attr, + const memory_desc_t *wei, const dim_t *idhw, const dim_t *odhw, + const dim_t *pdhw, const dim_t *ddhw, data_type_t out_type, + prop_kind_t prop, bool has_offset0) { + using namespace memory_extra_flags; + auto real_wei = *wei; + const int off = (!idhw[1]) ? 2 + !idhw[2] : !idhw[0]; + const bool with_groups = (real_wei.ndims == (6 - off)); + if (real_wei.extra.flags & compensation_gpu_conv_asymmetric_src_swap) { + static_assert(DNNL_MAX_NDIMS == 12, "DNNL_MAX_NDIMS is not 12"); + std::array perm_grp + = {0, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + std::array perm_no_grp + = {1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + CHECK(memory_desc_permute_axes(real_wei, *wei, + (with_groups) ? perm_grp.data() : perm_no_grp.data())); + } + real_wei.extra = memory_extra_desc_t(); + + const auto &dims = real_wei.dims; + const bool is_fwd = ((prop == prop_kind::forward_training) + || (prop == prop_kind::forward_inference)); + const bool is_bwd_d = (prop == prop_kind::backward_data); + assert((off < 3) && (real_wei.ndims >= 5 - off) && (is_fwd || is_bwd_d)); + MAYBE_UNUSED(is_fwd); + + using memory_dims = std::vector; + memory_dims S1 {1, 1, 1}; + memory_dims P1 {0, 0, 0}; + // dim order for weights: [G,] OC, IC, [[[D,] H,] W] + memory_dims dims_in {1, + (with_groups) ? dims[0] * dims[2 - is_bwd_d] : dims[1 - is_bwd_d]}; + memory_dims dims_out {1, + (with_groups) ? dims[0] * dims[1 + is_bwd_d] : dims[0 + is_bwd_d]}; + for (int i = off; i < 3; i++) { + const auto k_idx = 2 + with_groups + i - off; + const auto KD = (dims[k_idx] - 1) * (ddhw[i] + 1) + 1; + dims_in.emplace_back(idhw[i]); + dims_out.emplace_back(odhw[i]); + P1[i] = dims_out.back() - dims_in.back() - 1 + KD - pdhw[i]; + } + + memory_desc_t in, out; + CHECK(memory_desc_init_by_tag(out, int(dims_out.size()), dims_out.data(), + out_type, format_tag::any)); + CHECK(memory_desc_init_by_tag(in, int(dims_in.size()), dims_in.data(), + data_type::s8, format_tag::any)); + + if (has_offset0) { + auto out_type_size = types::data_type_size(out_type); + auto offset0 = memory_desc_wrapper(real_wei).size(0, false); + assert(offset0 % out_type_size == 0); + out.offset0 = offset0 / out_type_size; + } + auto conv_desc = convolution_desc_t(); + CHECK(dnnl::impl::conv_desc_init(&conv_desc, prop, + alg_kind::convolution_direct, (is_bwd_d) ? &out : &in, &real_wei, + nullptr, (is_bwd_d) ? &in : &out, S1.data() + off, ddhw + off, + pdhw + off, P1.data() + off)); + primitive_desc_iterator_t it(eng, (op_desc_t *)&conv_desc, &attr, nullptr); + if (!it.is_initialized()) return status::out_of_memory; + retn = *(++it); + return (retn) ? 
status::success : status::unimplemented; +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/gpu_zero_points_conv.hpp b/src/gpu/gpu_zero_points_conv.hpp new file mode 100644 index 00000000000..e287454b4ec --- /dev/null +++ b/src/gpu/gpu_zero_points_conv.hpp @@ -0,0 +1,36 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_GPU_ZERO_POINTS_CONV_HPP +#define GPU_GPU_ZERO_POINTS_CONV_HPP + +#include "common/primitive_desc.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t create_zp_precompute_conv_pd(std::shared_ptr &retn, + dnnl::impl::engine_t *eng, const primitive_attr_t &attr, + const memory_desc_t *wei, const dim_t *idhw, const dim_t *odhw, + const dim_t *pdhw, const dim_t *ddhw, data_type_t out_type, + prop_kind_t prop, bool has_offset0 = true); + +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index 150918d9a9c..9a206c11554 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -28,6 +28,8 @@ #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/jit/codegen/operand.hpp" #include "gpu/intel/jit/codegen/register_allocator.hpp" +#include "gpu/intel/jit/codegen/register_scope.hpp" +#include "gpu/intel/jit/codegen/reorder.hpp" #include "gpu/intel/jit/emulation.hpp" #include "gpu/intel/jit/ir/ir.hpp" #include "gpu/intel/jit/ir/ir_builder.hpp" @@ -591,20 +593,28 @@ class ir_kernel_t : public jit_generator { } void eadd3(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, - const ngen_operand_t &src0, const ngen_operand_t &src1, - const ngen_operand_t &src2) { + const ngen_operand_t &_src0, const ngen_operand_t &_src1, + const ngen_operand_t &_src2) { + auto src0 = _src0; + auto src1 = _src1; + auto src2 = _src2; + auto scope = ngen_register_scope_t(ra_); + align_src_dst_offset(this, scope, mod, dst, src0); + align_src_dst_offset(this, scope, mod, dst, src1); if (hw >= ngen::HW::XeHP) { if (src2.is_reg_data()) { - add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.reg_data()); + align_src_dst_offset(this, scope, mod, dst, src2); + add3(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.reg_data()); } else { - add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.immediate()); + add3(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } return; } add(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); if (src2.is_reg_data()) { + align_src_dst_offset(this, scope, mod, dst, src2); add(mod, dst.reg_data(), dst.reg_data(), src2.reg_data()); } else { add(mod, dst.reg_data(), dst.reg_data(), src2.immediate()); @@ -612,26 +622,34 
@@ class ir_kernel_t : public jit_generator { } void emad(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, - const ngen_operand_t &src0, const ngen_operand_t &src1, - const ngen_operand_t &src2) { + const ngen_operand_t &_src0, const ngen_operand_t &_src1, + const ngen_operand_t &_src2) { + auto src0 = _src0; + auto src1 = _src1; + auto src2 = _src2; + auto scope = ngen_register_scope_t(ra_); + align_src_dst_offset(this, scope, mod, dst, src1); if (src2.is_reg_data()) { - mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.reg_data()); + align_src_dst_offset(this, scope, mod, dst, src0); + align_src_dst_offset(this, scope, mod, dst, src2); + mad(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.reg_data()); } else if (hw < ngen::HW::XeLP) { + align_src_dst_offset(this, scope, mod, dst, src0); mul(mod, dst.reg_data(), src1.reg_data(), src2.immediate()); add(mod, dst.reg_data(), dst.reg_data(), src0.reg_data()); } else if (src0.is_immediate() && (ngen_is_dw(src0.type()) || src0.type() == ngen::DataType::uw)) { // dword immediate src0 is not supported, move to a register. - auto tmp_src0 = ra_.alloc_sub(src0.type()); + auto tmp_src0 = scope.alloc_sub(src0.type()); mov(1, tmp_src0, src0.immediate()); - mad(mod, dst.reg_data(), tmp_src0, src1.reg_data(), - src2.immediate()); - ra_.safeRelease(tmp_src0); + mad(mod, dst.reg_data(), tmp_src0, + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } else { - mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.immediate()); + align_src_dst_offset(this, scope, mod, dst, src0); + mad(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } } @@ -1144,6 +1162,13 @@ class ir_kernel_t : public jit_generator { return ir_utils::safe_divide(local_size, exec_cfg_.simd()); } + static ngen::RegData fixup_ternary_rgn(const ngen::RegData &r) { + ngen::RegData retn = r; + return ((retn.getHS() == 1) && (retn.getVS() == retn.getWidth())) + ? retn.setRegion(1, 1, 0) + : retn; + } + kernel_iface_t kernel_iface_; std::string kernel_name_; exec_config_t exec_cfg_; diff --git a/src/gpu/intel/jit/codegen/reorder.hpp b/src/gpu/intel/jit/codegen/reorder.hpp index 12d2187c8d1..aa4bc370794 100644 --- a/src/gpu/intel/jit/codegen/reorder.hpp +++ b/src/gpu/intel/jit/codegen/reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1300,15 +1300,17 @@ void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope, int dst_off = dst.offset(); int src_byte_off = src.byte_offset(); int dst_byte_off = dst.byte_offset(); + int esize = mod.getExecSize(); + int grf_size = ngen::GRF::bytes(scope.hw()); + int grf_src = grf_size / src.hs(); + int grf_dst = grf_size / dst.hs(); // If src is aligned with dst, return. - if ((is_xf || is_bf_to_f) && src_off == dst_off) return; - if (!is_xf && src_byte_off == dst_byte_off) return; + if ((is_xf || is_bf_to_f) && src_off % grf_src == dst_off % grf_dst) return; + if (!is_xf && src_byte_off % grf_size == dst_byte_off % grf_size) return; int new_src_byte_off = (is_xf ? 
dst_off * src_type_size : dst_byte_off); - int esize = mod.getExecSize(); - int grf_size = ngen::GRF::bytes(scope.hw()); int src_size = std::max(src_type_size * esize * src_stride, src_type_size); auto new_src = scope.alloc_reg_buf_data( diff --git a/src/gpu/intel/jit/conv/config.cpp b/src/gpu/intel/jit/conv/config.cpp index 08bbb4e4166..fe36059d3e0 100644 --- a/src/gpu/intel/jit/conv/config.cpp +++ b/src/gpu/intel/jit/conv/config.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/utils.hpp" #include "gpu/intel/jit/conv/grf_usage.hpp" #include "gpu/intel/jit/conv/message_patterns.hpp" #include "gpu/intel/jit/conv/normalization.hpp" @@ -659,6 +660,61 @@ void init_data_tags(const conv_config_t &cfg, const memory_desc_t &src_md, if (user_dst_req == "user") dst_tag = user_dst_tag = "user"; } +void prepare_zp_precompute_conv(const conv_problem_t &prb, dim_t *idhw, + dim_t *odhw, dim_t *pdhw, dim_t *ddhw) { + const bool is_bwd_d = (prb.prop_kind() == prop_kind::backward_data); + using memory_dims = std::vector; + memory_dims I {prb.id, prb.ih, prb.iw}; + memory_dims O {prb.od, prb.oh, prb.ow}; + memory_dims K {prb.kd, prb.kh, prb.kw}; + memory_dims S {prb.sd, prb.sh, prb.sw}; + memory_dims D {prb.dd, prb.dh, prb.dw}; + memory_dims P {prb.pd, prb.ph, prb.pw}; + const int off = 5 - prb.ndims; + const auto *w = prb.conv_pd->weights_md(); + + // restore the original layout of the prb values + const auto *s + = (is_bwd_d) ? prb.conv_pd->diff_dst_md() : prb.conv_pd->src_md(); + const auto *d + = (is_bwd_d) ? prb.conv_pd->diff_src_md() : prb.conv_pd->dst_md(); + auto has_dim = [&](int i) { + return (s->dims[2 + i] > 1) || (d->dims[2 + i] > 1) + || (w->dims[2 + i + prb.with_groups] > 1); + }; + auto move_back = [&](int i, int off) { + if (off == 0) return; + I[i - off] = O[i - off] = K[i - off] = S[i - off] = 1; + D[i - off] = P[i - off] = 0; + std::swap(I[i - off], I[i]); + std::swap(O[i - off], O[i]); + std::swap(K[i - off], K[i]); + std::swap(S[i - off], S[i]); + std::swap(D[i - off], D[i]); + std::swap(P[i - off], P[i]); + }; + bool has_d = (off <= 0) && has_dim(0 - off); + bool has_h = (off <= 1) && has_dim(1 - off); + bool has_w = (off <= 2) && has_dim(2 - off); + if (!has_d && !has_h && !has_w) has_w = true; + move_back(1, has_d * (!has_h == has_w)); + move_back(2, !has_w * (!has_h + 1)); + + for (int i = off; i < int(K.size()); i++) { + const auto KD = (K[i] - 1) * (D[i] + 1) + 1; + ir_assert(w->dims[2 + i + prb.with_groups - off] == K[i]); + O[i] = ir_utils::max_unique_pad_states( + O[i], I[i], KD, P[i], S[i], true); + I[i] = std::min(KD, I[i]); + } + for (int i = 0; i < 3; i++) { + idhw[i] = (i < off) ? 0 : I[i]; + odhw[i] = (i < off) ? 0 : O[i]; + pdhw[i] = (i < off) ? 0 : P[i]; + ddhw[i] = (i < off) ? 
0 : D[i]; + } +} + status_t init_tensor_layouts( conv_config_t &cfg, convolution_pd_t *pd, impl::engine_t *engine) { const auto &prb = cfg.prb(); @@ -778,6 +834,27 @@ status_t init_tensor_layouts( bia.set_compute(bia_layout); bia.set_user(user_bia_layout); + if (cfg.zp_cfg().needs_src_reorder_precalc) { + auto get_channels = [](const layout_t &layout) { + const dim_t min_esize = 16; + return std::max(utils::rnd_up_pow2(layout.dim(1) * layout.dim(2)), + min_esize); + }; + using namespace memory_extra_flags; + prepare_zp_precompute_conv(prb, wei_md.extra.idhw, wei_md.extra.odhw, + wei_md.extra.pdhw, wei_md.extra.ddhw); + + wei_md.extra.dst_size = sizeof(float); + for (const auto &o : wei_md.extra.odhw) + wei_md.extra.dst_size *= std::max(o, dim_t(1)); + if (prb.prop_kind() == prop_kind::backward_data) { + wei_md.extra.flags |= compensation_gpu_conv_asymmetric_src_bwd; + wei_md.extra.dst_size *= get_channels(src_layout); + } else { + wei_md.extra.dst_size *= get_channels(dst_layout); + } + wei_md.extra.flags |= compensation_gpu_conv_asymmetric_src; + } return status::success; } diff --git a/src/gpu/intel/jit/conv/config.hpp b/src/gpu/intel/jit/conv/config.hpp index f698d7ab546..b20e7240889 100644 --- a/src/gpu/intel/jit/conv/config.hpp +++ b/src/gpu/intel/jit/conv/config.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -675,6 +675,8 @@ int default_regs(const conv_config_t &cfg); void init_kernel_grid(conv_config_t &cfg); void init_walk_order(conv_config_t &cfg); void init_thread_group_grid(conv_config_t &cfg); +void prepare_zp_precompute_conv(const conv_problem_t &prb, dim_t *idhw, + dim_t *odhw, dim_t *pdhw, dim_t *ddhw); std::array get_kernel_grid_conv_dims(const conv_config_t &cfg); std::array get_thread_group_grid_conv_dims( const conv_config_t &cfg); diff --git a/src/gpu/intel/jit/conv/gen_convolution.cpp b/src/gpu/intel/jit/conv/gen_convolution.cpp index 3719f00f371..a292f27c98c 100644 --- a/src/gpu/intel/jit/conv/gen_convolution.cpp +++ b/src/gpu/intel/jit/conv/gen_convolution.cpp @@ -25,6 +25,7 @@ #include "common/impl_registration.hpp" #include "common/utils.hpp" #include "common/verbose.hpp" +#include "gpu/gpu_zero_points_conv.hpp" #include "gpu/intel/jit/ir/kernel_info.hpp" #include "gpu/intel/jit/reorder/reorder_kernel.hpp" #include "gpu/intel/jit/utils/utils.hpp" @@ -45,8 +46,7 @@ struct conv_pd_data_t { conv_config_t pd_cfg; tensor_config_t tensor_cfg; std::vector kernel_infos; - std::shared_ptr zp_pd; - std::shared_ptr zp_prim; + std::shared_ptr zp_pd; }; class gen_convolution_t { @@ -72,79 +72,31 @@ class gen_convolution_t { CHECK(init_pd_time_cfg( prb, pd->data->pd_cfg, engine, pd, &pd->attr_)); - if (pd->data->pd_cfg.zp_cfg().needs_src_precalc) { - memory::dims I {prb.id, prb.ih, prb.iw}; - memory::dims O {prb.od, prb.oh, prb.ow}; - memory::dims K {prb.kd, prb.kh, prb.kw}; - memory::dims S {prb.sd, prb.sh, prb.sw}; - memory::dims D {prb.dd, prb.dh, prb.dw}; - memory::dims P {prb.pd, prb.ph, prb.pw}; - const int off = 5 - prb.ndims; - const auto *w = pd->invariant_wei_md(); - { // restore the original layout of the prb values - const auto *s = pd->invariant_src_md(); - const auto *d = pd->invariant_dst_md(); - auto has_dim = [&](int i) { - return (s->dims[2 + i] > 1) || (d->dims[2 + i] > 1) - || (w->dims[2 + 
i + prb.with_groups] > 1); - }; - auto move_back = [&](int i, int off) { - if (off == 0) return; - I[i - off] = O[i - off] = K[i - off] = S[i - off] = 1; - D[i - off] = P[i - off] = 0; - std::swap(I[i - off], I[i]); - std::swap(O[i - off], O[i]); - std::swap(K[i - off], K[i]); - std::swap(S[i - off], S[i]); - std::swap(D[i - off], D[i]); - std::swap(P[i - off], P[i]); - }; - bool has_d = (off <= 0) && has_dim(0 - off); - bool has_h = (off <= 1) && has_dim(1 - off); - bool has_w = (off <= 2) && has_dim(2 - off); - if (!has_d && !has_h && !has_w) has_w = true; - move_back(1, has_d * (!has_h == has_w)); - move_back(2, !has_w * (!has_h + 1)); + if (pd->data->pd_cfg.zp_cfg().needs_src_reorder_precalc + || pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + primitive_attr_t attr; + if (pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + int mask = 0; + CHECK(pd->attr_.zero_points_.get(DNNL_ARG_SRC, &mask)); + attr.zero_points_.set(DNNL_ARG_SRC, mask); + attr.post_ops_.append_eltwise( + 1.f, alg_kind::eltwise_linear, -1.f, 0.f); } - memory::dims S1 {1, 1, 1}; - memory::dims P1 {0, 0, 0}; - memory::dims dims_src {1, dim_t(prb.g) * prb.ic}; - memory::dims dims_dst {1, dim_t(prb.g) * prb.oc}; - - for (int i = off; i < int(K.size()); i++) { - const auto KD = (K[i] - 1) * (D[i] + 1) + 1; - dims_src.emplace_back(std::min(KD, I[i])); - dims_dst.emplace_back(ir_utils::max_unique_pad_states( - O[i], I[i], KD, P[i], S[i], true)); - P1[i] = dims_dst.back() - dims_src.back() - 1 + KD - P[i]; + dim_t I[3], O[3], P[3], D[3]; + prepare_zp_precompute_conv(prb, I, O, P, D); + CHECK(create_zp_precompute_conv_pd(pd->data->zp_pd, engine, + attr, pd->weights_md(), I, O, P, D, data_type::f32, + pd->get_prop_kind(), + !pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc)); + if (pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + auto scratchpad = pd->scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_nested_multiple, + pd->data->zp_pd->scratchpad_registry()); } - memory::desc src(dims_src, memory::data_type::s8, - memory::format_tag::any); - memory::desc dst(dims_dst, memory::data_type::s32, - memory::format_tag::any); - - // create a nested conv and allocate a nested scratchpad for it - primitive_attr_t attr; - int mask = 0; - CHECK(pd->attr_.zero_points_.get(DNNL_ARG_SRC, &mask)); - attr.zero_points_.set(DNNL_ARG_SRC, mask); - attr.post_ops_.append_eltwise( - 1.f, alg_kind_t::dnnl_eltwise_linear, -1.f, 0.f); - dnnl_primitive_desc *zp_pd; - CHECK(dnnl_convolution_forward_primitive_desc_create(&zp_pd, - engine, dnnl_prop_kind_t::dnnl_forward_inference, - dnnl_alg_kind_t::dnnl_convolution_direct, src.get(), w, - nullptr, dst.get(), S1.data() + off, D.data() + off, - P.data() + off, P1.data() + off, &attr)); - pd->data->zp_pd.reset(zp_pd, dnnl_primitive_desc_destroy); - auto scratchpad = pd->scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_nested_multiple, - pd->data->zp_pd->impl()->scratchpad_registry()); } - pd->data->tensor_cfg = get_tensor_config(pd->data->pd_cfg, - (pd->data->zp_pd) ? 
pd->data->zp_pd->impl()->src_md() - : nullptr); + pd->data->tensor_cfg = get_tensor_config( + pd->data->pd_cfg, zp_conv_md_in(*pd->data)); pd->data->kernel_infos.reserve(max_kernels); CHECK(init_kernel_infos(pd)); @@ -176,7 +128,7 @@ class gen_convolution_t { int max_tries = 100; conv_config_t cfg; layout_t zp_dst; - if (data.zp_pd) zp_dst = layout_t(data.zp_pd->impl()->dst_md(), false); + if (data.zp_pd) zp_dst = layout_t(zp_conv_md_out(data), false); if (primitive->cache_blob()) { tiler->set_cur_version(primitive->version()); @@ -198,8 +150,17 @@ class gen_convolution_t { ir_info() << cfg; init_nd_ranges(primitive, cfg); - auto &kernel_infos = data.kernel_infos; + + // This absolutely HAS to be executed first if present, + // since it adds its own version mark to the cache blob + for (int i = 0; i < int(kernel_infos.size()); i++) + if (kernel_infos[i].id() == kernel_id_t::zp_precalc) { + ir_assert(data.zp_pd); + CHECK(primitive->create_nested_primitive( + zp_prim_, data.zp_pd, engine)); + } + std::vector tmp_kernels; for (int i = 0; i < int(kernel_infos.size()); i++) { auto &info = kernel_infos[i]; @@ -248,10 +209,6 @@ class gen_convolution_t { break; case kernel_id_t::zp_precalc: - ir_assert(data.zp_pd); - if (!data.zp_prim) - CHECK(data.zp_pd->impl()->create_primitive( - data.zp_prim, engine)); tmp_kernels.emplace_back(); continue; @@ -319,12 +276,11 @@ class gen_convolution_t { new memory_t(ctx.stream()->engine(), md, std::move(s))); }; - ir_assert(data.zp_prim); + ir_assert(zp_prim_); std::unique_ptr zp_src, zp_dst; - CHECK(scratchpad_arg(zp_src, "src_zero_points", - data.zp_pd->impl()->src_md())); CHECK(scratchpad_arg( - zp_dst, "dst", data.zp_pd->impl()->dst_md())); + zp_src, "src_zero_points", zp_conv_md_in(data))); + CHECK(scratchpad_arg(zp_dst, "dst", zp_conv_md_out(data))); exec_args_t e_args; auto src_zp_idx = DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC; @@ -334,9 +290,9 @@ class gen_convolution_t { e_args[DNNL_ARG_DST] = memory_arg_t {zp_dst.get(), false}; exec_ctx_t e_ctx(ctx, std::move(e_args)); const auto nm = memory_tracking::names::key_nested_multiple; - nested_scratchpad_t ns(ctx, nm, data.zp_prim); + nested_scratchpad_t ns(ctx, nm, zp_prim_); e_ctx.set_scratchpad_grantor(ns.grantor()); - CHECK(data.zp_prim->execute(e_ctx)); + CHECK(zp_prim_->execute(e_ctx)); } nsubmitted++; if (nsubmitted == nkernels) break; @@ -347,6 +303,20 @@ class gen_convolution_t { } private: + static const memory_desc_t *zp_conv_md_in(const conv_pd_data_t &data) { + if (!data.zp_pd) return nullptr; + const bool is_bwd_d + = (data.zp_pd->get_prop_kind() == prop_kind::backward_data); + return (is_bwd_d) ? data.zp_pd->diff_dst_md() : data.zp_pd->src_md(); + } + + static const memory_desc_t *zp_conv_md_out(const conv_pd_data_t &data) { + if (!data.zp_pd) return nullptr; + const bool is_bwd_d + = (data.zp_pd->get_prop_kind() == prop_kind::backward_data); + return (is_bwd_d) ? data.zp_pd->diff_src_md() : data.zp_pd->dst_md(); + } + template static kernel_info_t &create_kernel_info(T *pd, kernel_id_t kernel_id) { auto &infos = pd->data->kernel_infos; @@ -361,10 +331,8 @@ class gen_convolution_t { static status_t init_kernel_infos(T *pd) { auto &data = *pd->data; auto &cfg = data.pd_cfg; - const bool needs_zp_precalc = cfg.zp_cfg().needs_src_precalc; - auto &conv_info = create_kernel_info(pd, kernel_id_t::convolution); - auto &zp_precalc_info = (needs_zp_precalc) + auto &zp_precalc_info = (cfg.zp_cfg().needs_src_conv_precalc) ? 
create_kernel_info(pd, kernel_id_t::zp_precalc) : conv_info; @@ -374,8 +342,10 @@ class gen_convolution_t { // Initialize kernel arguments. int scratchpad_key = memory_tracking::names::key_none; for (auto &t : data.tensor_cfg.tensors()) { - const bool src_zp_precalc - = needs_zp_precalc && (t.name == "src_zero_points"); + const bool wei_reorder_precalc = (t.name == "wei") + && cfg.zp_cfg().needs_src_reorder_precalc; + const bool src_conv_precalc = (t.name == "src_zero_points") + && cfg.zp_cfg().needs_src_conv_precalc; const auto compute_buf = make_buffer(t.name); size_t compute_size = t.compute_layout.size(); @@ -390,7 +360,7 @@ class gen_convolution_t { auto add_compute_arg = [&](kernel_info_t &ki, const expr_t &buf, bool is_input) { - if (t.needs_reorder || src_zp_precalc) + if (t.needs_reorder || src_conv_precalc) ki.register_scratchpad_arg( buf, compute_arg_key, is_input, compute_size); else @@ -411,12 +381,12 @@ class gen_convolution_t { return zero_out_info; }; - if (t.needs_reorder || src_zp_precalc) { + if (t.needs_reorder || src_conv_precalc) { int user_arg_key = compute_arg_key; auto user_buf = make_buffer(t.name + "_user"); compute_arg_key = ++scratchpad_key; - if (!src_zp_precalc && t.is_input) { + if (!src_conv_precalc && t.is_input) { auto &reorder_info = create_kernel_info(pd, kernel_id_t::pre_reorder); reorder_info.register_user_arg(user_buf, user_arg_key, @@ -425,7 +395,7 @@ class gen_convolution_t { reorder_info.set_nd_range(reorder_kernel_t<>::nd_range( cfg.exec_cfg(), t.user_layout, t.compute_layout)); } - if (!src_zp_precalc && t.is_output) { + if (!src_conv_precalc && t.is_output) { auto &reorder_info = create_kernel_info(pd, kernel_id_t::post_reorder); add_compute_arg(reorder_info, compute_buf, true); @@ -434,7 +404,7 @@ class gen_convolution_t { reorder_info.set_nd_range(reorder_kernel_t<>::nd_range( cfg.exec_cfg(), t.compute_layout, t.user_layout)); } - if (src_zp_precalc) { + if (src_conv_precalc) { scratchpad_book(++scratchpad_key); create_zero_out_info().register_scratchpad_arg(compute_buf, scratchpad_key, /*is_input=*/false, compute_size); @@ -456,6 +426,12 @@ class gen_convolution_t { add_compute_arg(zp_precalc_info, make_buffer("dst"), false); } scratchpad_book(compute_arg_key); + if (wei_reorder_precalc) { + // user-supplied weights contain precomputed ZP values, so + // the buffer is to be passed to the conv alongside weights + conv_info.register_user_arg( + user_buf, user_arg_key, t.is_input && !t.is_output); + } } if (t.needs_zero_out) { add_compute_arg(create_zero_out_info(), compute_buf, false); @@ -512,6 +488,7 @@ class gen_convolution_t { std::vector kernels_; std::vector nd_ranges_; + std::shared_ptr zp_prim_; }; status_t gen_convolution_fwd_t::pd_t::init(impl::engine_t *engine) { diff --git a/src/gpu/intel/jit/conv/normalization.cpp b/src/gpu/intel/jit/conv/normalization.cpp index 71dabfe229d..e1914e3eba3 100644 --- a/src/gpu/intel/jit/conv/normalization.cpp +++ b/src/gpu/intel/jit/conv/normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -175,9 +175,11 @@ void maybe_reshape_dims(dim_idx_t ndims, layout_t &layout, // this method only gets called when ZP precompute is in order; // in all other cases ZPs are applied ad-hoc, without a post-op view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { - auto map_o2k = [](view_t &v, dim_idx_t idx, dim_t O, dim_t I, dim_t KD, - dim_t P, dim_t S) { - const bool needs_right_bound = ((O - 1) * S + (KD - P) >= I); + auto map_o2k = [this](view_t &v, dim_idx_t idx, dim_t O, dim_t I, dim_t K, + dim_t D, dim_t P, dim_t S) { + const auto KD = (K - 1) * (D + 1) + 1; + const auto KDP = (KD > 1) ? KD - P : 0; + const bool needs_right_bound = (O - 1) * S + KDP >= I; expr_t o = v.vvars()[idx]; if (KD >= I) { o = o * S; @@ -186,7 +188,13 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { dim_t off = P; if (P > 0) l = binary_op_t::make(op_kind_t::_min, o * S - P, 0); if (needs_right_bound) { - r = binary_op_t::make(op_kind_t::_max, o * S + (KD - P), I); + if (schedule_.var_bound(o) > O) { + auto q = binary_op_t::make( + op_kind_t::_min, o * S + KDP, (O - 1) * S + KDP); + r = binary_op_t::make(op_kind_t::_max, q, I); + } else { + r = binary_op_t::make(op_kind_t::_max, o * S + KDP, I); + } off -= I; } o = (!l.is_empty()) ? l : o; @@ -218,9 +226,6 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { } dst = layout_t(dst.type(), dst.ndims(), dst.offset(), new_blk, false); - const auto KDD = (prb_.kd - 1) * (prb_.dd + 1) + 1; - const auto KDH = (prb_.kh - 1) * (prb_.dh + 1) + 1; - const auto KDW = (prb_.kw - 1) * (prb_.dw + 1) + 1; view_t view(vars, 6); view.set_vdim(vars[0], 1); // mb view.set_vdim(vars[1], prb_.g); @@ -228,9 +233,9 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { view.set_tdim(0, vars[0]); view.set_tdim(1, vars[1]); view.set_tdim(2, vars[2]); - map_o2k(view, 3, prb_.od, prb_.id, KDD, prb_.pd, prb_.sd); - map_o2k(view, 4, prb_.oh, prb_.ih, KDH, prb_.ph, prb_.sh); - map_o2k(view, 5, prb_.ow, prb_.iw, KDW, prb_.pw, prb_.sw); + map_o2k(view, 3, prb_.od, prb_.id, prb_.kd, prb_.dd, prb_.pd, prb_.sd); + map_o2k(view, 4, prb_.oh, prb_.ih, prb_.kh, prb_.dh, prb_.ph, prb_.sh); + map_o2k(view, 5, prb_.ow, prb_.iw, prb_.kw, prb_.dw, prb_.pw, prb_.sw); view.set_tlayout(dst); return view; } diff --git a/src/gpu/intel/jit/conv/normalization.hpp b/src/gpu/intel/jit/conv/normalization.hpp index 7e49a4c4a2c..cf926487376 100644 --- a/src/gpu/intel/jit/conv/normalization.hpp +++ b/src/gpu/intel/jit/conv/normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
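The reworked map_o2k() above folds the dilated-kernel math into the helper itself. Below is a standalone sketch (plain C++, not oneDNN code) of that math, using the same variable names as the diff: K kernel size, D dilation, P left padding, S stride, O output size, I input size.

#include <cstdint>
#include <iostream>

int main() {
    const std::int64_t K = 3, D = 1, P = 1, S = 2, O = 8, I = 16;

    // Effective extent of a dilated kernel: (K - 1) * (D + 1) + 1.
    // With K = 3, D = 1 every other tap is skipped, so the window spans 5 inputs.
    const std::int64_t KD = (K - 1) * (D + 1) + 1;
    const std::int64_t KDP = (KD > 1) ? KD - P : 0;

    // The window of the last output o = O - 1 covers inputs [o*S - P, o*S - P + KD - 1],
    // so it touches the right padding whenever (O - 1) * S + (KD - P) >= I,
    // which is the needs_right_bound condition above.
    const bool needs_right_bound = (O - 1) * S + KDP >= I; // 18 >= 16 -> true here

    std::cout << "KD=" << KD
              << " needs_right_bound=" << needs_right_bound << "\n";
    return 0;
}

The extra min() branch added by the patch covers schedules whose loop bound for o is rounded up past O: it clamps the upper-bound expression to the value it takes at the last real output, o = O - 1, before the max() against I.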
@@ -32,7 +32,8 @@ class conv_post_op_view_mapper_t : public post_op_view_mapper_t { const conv_problem_t &prb, const zero_points_config_t &zp_cfg, const layout_t &zp_dst) : post_op_view_mapper_t(schedule.c_view()) - , has_external_src_zps_(zp_cfg.needs_src_precalc) + , has_external_src_zps_(zp_cfg.needs_src_conv_precalc + || zp_cfg.needs_src_reorder_precalc) , schedule_(schedule) , prb_(prb) , zp_dst_(zp_dst) {} diff --git a/src/gpu/intel/jit/conv/zp_plan.cpp b/src/gpu/intel/jit/conv/zp_plan.cpp index f65eefcb28f..13f7aae303a 100644 --- a/src/gpu/intel/jit/conv/zp_plan.cpp +++ b/src/gpu/intel/jit/conv/zp_plan.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1446,7 +1446,6 @@ class zp_comp_apply_plan_t : public base_plan_t { struct zp_plan_impl_t : public base_plan_t { bool src_2d_loads = false; - bool needs_precalc = false; bool has_dpasw = false; split_dispatcher_t sd; send_plan_t load; @@ -1526,8 +1525,9 @@ void zp_plan_t::init(const conv_config_t &cfg, bool src_2d_loads, const layout_t &wei_layout, const layout_t &dst_layout) { impl->src_2d_loads = src_2d_loads; impl->has_dpasw = cfg.fma_kind() == fma_kind_t::dpasw; - impl->needs_precalc = cfg.zp_cfg().needs_src_precalc; - bool do_src = cfg.zp_cfg().do_src_compensation && !impl->needs_precalc; + bool do_src = cfg.zp_cfg().do_src_compensation + && !cfg.zp_cfg().needs_src_reorder_precalc + && !cfg.zp_cfg().needs_src_conv_precalc; bool do_wei = cfg.zp_cfg().do_wei_compensation; send_plan_t impl_load; @@ -1574,10 +1574,6 @@ bool zp_plan_t::has_zp_wei() const { return impl->has_zp_wei(); } -bool zp_plan_t::needs_precalc() const { - return impl->needs_precalc; -} - int zp_plan_t::load_reg_buf_size() const { return impl->load.reg_buf_size(); } diff --git a/src/gpu/intel/jit/conv/zp_plan.hpp b/src/gpu/intel/jit/conv/zp_plan.hpp index 14ec03419fd..267de03422f 100644 --- a/src/gpu/intel/jit/conv/zp_plan.hpp +++ b/src/gpu/intel/jit/conv/zp_plan.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
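A small sketch (the helper name and enum below are mine, not oneDNN API) of how the two new flags partition source zero-point handling: the two precalc modes are mutually exclusive by construction, and in-kernel compensation in zp_plan_t is generated only when neither applies, which is exactly the reworked do_src condition above.

#include <iostream>

enum class src_zp_mode { none, in_kernel, reorder_precalc, conv_precalc };

src_zp_mode select_src_zp_mode(bool do_src_compensation,
        bool needs_src_reorder_precalc, bool needs_src_conv_precalc) {
    if (!do_src_compensation) return src_zp_mode::none;
    if (needs_src_reorder_precalc) return src_zp_mode::reorder_precalc;
    if (needs_src_conv_precalc) return src_zp_mode::conv_precalc;
    // Only in this last case does zp_plan_t emit in-kernel compensation code.
    return src_zp_mode::in_kernel;
}

int main() {
    auto m = select_src_zp_mode(true, false, true);
    std::cout << static_cast<int>(m) << "\n"; // 3 == conv_precalc
    return 0;
}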
@@ -46,7 +46,6 @@ struct zp_plan_t : public base_plan_t { bool is_src_precomp_compatible() const; bool has_zp_src() const; bool has_zp_wei() const; - bool needs_precalc() const; int load_reg_buf_size() const; int mask_reg_buf_size() const; int comp_reg_buf_size() const; diff --git a/src/gpu/intel/jit/ir/epilogue.cpp b/src/gpu/intel/jit/ir/epilogue.cpp index d0f910e586b..315ce632dda 100644 --- a/src/gpu/intel/jit/ir/epilogue.cpp +++ b/src/gpu/intel/jit/ir/epilogue.cpp @@ -278,6 +278,9 @@ class post_op_tensor_t { stmt_t build_prefetch_stmt(const view_t &c_view) const { ir_assert(needs_load()); + // Disable prefetching for precomputed ZPs stored at the end of 'wei' + if ((mem_buf().str() == "wei") || (mem_buf().str() == "wei_user")) + return stmt_t(); auto prefetch = make_access_builder(*ir_ctx_, mem_view(), mem_buf(), expr_t(), send_op_t::prefetch, send_address_t::a64, get_cache_hint(c_view)); diff --git a/src/gpu/intel/jit/ir/kernel_info.hpp b/src/gpu/intel/jit/ir/kernel_info.hpp index 86390264760..56700b72bf9 100644 --- a/src/gpu/intel/jit/ir/kernel_info.hpp +++ b/src/gpu/intel/jit/ir/kernel_info.hpp @@ -144,11 +144,11 @@ class kernel_info_t { // Returns stage ID, kernels with smaller stage IDs are executed first. int stage_id() const { switch (id()) { - case kernel_id_t::pre_reorder: return 0; case kernel_id_t::zero_out: return 0; case kernel_id_t::zp_precalc: return 1; - case kernel_id_t::convolution: return 2; - case kernel_id_t::post_reorder: return 3; + case kernel_id_t::pre_reorder: return 2; + case kernel_id_t::convolution: return 3; + case kernel_id_t::post_reorder: return 4; default: ir_error_not_expected(); } return -1; diff --git a/src/gpu/intel/jit/ir/post_ops.cpp b/src/gpu/intel/jit/ir/post_ops.cpp index e10294ef5d1..a0dd20f7ad8 100644 --- a/src/gpu/intel/jit/ir/post_ops.cpp +++ b/src/gpu/intel/jit/ir/post_ops.cpp @@ -108,12 +108,27 @@ post_op_context_t::post_op_context_t(const primitive_attr_t &attr, if (po_vm_.can_use_simple_src_zps() && zp_cfg.do_src_compensation) { if (zp_cfg.is_runtime_src_zero_points) { - bool per_oc = !zp_cfg.is_common_src_zero_point - || zp_cfg.needs_src_precalc; - auto view = po_vm_.create_src_zp_view((per_oc) ? 1 << 1 : 0); + auto view = po_vm_.create_src_zp_view( + (!zp_cfg.is_common_src_zero_point) ? 
1 << 1 : 0); auto buf = kernel_info.find_arg("src_zero_points"); - auto in = add_input_tensor(view, buf); - post_ops_.emplace_back(c, c - in); + if (zp_cfg.needs_src_reorder_precalc) { + auto wei = kernel_info.find_arg("wei_user", true); + if (wei.is_empty()) wei = kernel_info.find_arg("wei"); + + layout_t tlayout(view.tlayout()); + tlayout.set_offset( + utils::div_up(schedule.b_view().tlayout().size(), + tlayout.type().size())); + view.set_tlayout(tlayout); + layout_t scalar(zp_cfg.src_zp_type, 0, + std::vector(view.vvars().size(), 1), false); + auto zp = add_input_tensor(view_t(scalar, view.vvars()), buf); + auto in = add_input_tensor(view, wei); + post_ops_.emplace_back(c, c - in * zp); + } else { + auto in = add_input_tensor(view, buf); + post_ops_.emplace_back(c, c - in); + } } else { auto func = eltwise_t::make(alg_kind::eltwise_linear, /*scale=*/1.f, diff --git a/src/gpu/intel/jit/ir/post_ops.hpp b/src/gpu/intel/jit/ir/post_ops.hpp index 84de8f0939d..72431dfc359 100644 --- a/src/gpu/intel/jit/ir/post_ops.hpp +++ b/src/gpu/intel/jit/ir/post_ops.hpp @@ -46,7 +46,8 @@ struct zero_points_config_t { bool is_common_src_zero_point = false; bool is_common_wei_zero_point = false; bool is_common_dst_zero_point = false; - bool needs_src_precalc = false; + bool needs_src_reorder_precalc = false; + bool needs_src_conv_precalc = false; int common_src_zero_point = 0; int common_wei_zero_point = 0; int common_dst_zero_point = 0; @@ -75,8 +76,10 @@ struct zero_points_config_t { pd && pd->attr()->zero_points_.common(DNNL_ARG_WEIGHTS)) , is_common_dst_zero_point( pd && pd->attr()->zero_points_.common(DNNL_ARG_DST)) - , needs_src_precalc( - pd && do_src_compensation && is_src_precalc_compatible(pd)) + , needs_src_reorder_precalc( + pd && do_src_compensation && can_use_src_reorder_precalc(pd)) + , needs_src_conv_precalc(pd && do_src_compensation + && !needs_src_reorder_precalc && can_use_src_conv_precalc(pd)) , common_src_zero_point(0) , common_wei_zero_point(0) , common_dst_zero_point(0) { @@ -102,12 +105,22 @@ struct zero_points_config_t { } private: - bool is_src_precalc_compatible(const primitive_desc_t *pd) { + bool can_use_src_reorder_precalc(const primitive_desc_t *pd) { if (pd->kind() != primitive_kind_t::dnnl_convolution) return false; - // In general, precomputed ZPs are slower than the regular ZPs up to a - // point where a nested convolution that does the precalc takes less - // time than the in-situ compensations; that usually happens around - // MB = 64, but the exact number is just a heuristic. + // Reorder-based precomputed ZPs are only available if the user did not + // specify the weights mem desc so the convolution can choose it freely + // and set a mem desc flag asking a reorder to precompute the values. + return (pd->invariant_wei_md()->format_kind == format_kind::any) + && pd->attr()->zero_points_.common(DNNL_ARG_SRC) + && pd->attr()->zero_points_.has_default_values( + DNNL_ARG_WEIGHTS); + } + bool can_use_src_conv_precalc(const primitive_desc_t *pd) { + if (pd->kind() != primitive_kind_t::dnnl_convolution) return false; + // In general, conv-based precomputed ZPs are slower than the regular + // ZPs up to a point where a nested convolution that does the precalc + // takes less time than the in-situ compensations; that usually happens + // around MB = 64, but the exact number is just a heuristic. 
// TODO: a finer-grained estimate return (pd->invariant_src_md()->dims[0] >= 64) && pd->attr()->zero_points_.has_default_values( diff --git a/src/gpu/intel/jit/ir/tensor_config.cpp b/src/gpu/intel/jit/ir/tensor_config.cpp index 20b8765df2b..a7c5e4f7c8d 100644 --- a/src/gpu/intel/jit/ir/tensor_config.cpp +++ b/src/gpu/intel/jit/ir/tensor_config.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,14 +38,14 @@ void init_extra_tensors(const zero_points_config_t &zp_cfg, /*is_input=*/true, /*is_output=*/false, zp_layout); }; if (zp_cfg.do_src_compensation && zp_cfg.is_runtime_src_zero_points) { - if (!zp_cfg.needs_src_precalc) { - add_zp_buffer("src_zero_points", zp_cfg.src_zp_type, DNNL_ARG_SRC, - (zp_cfg.is_common_src_zero_point) ? 1 : ic); - } else { + if (zp_cfg.needs_src_conv_precalc) { ir_assert(zp_src); int arg_key = DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC; tensor_cfg.add_tensor("src_zero_points", arg_key, /*is_input=*/true, /*is_output=*/false, layout_t(zp_src, false), layout_t()); + } else { + add_zp_buffer("src_zero_points", zp_cfg.src_zp_type, DNNL_ARG_SRC, + (zp_cfg.is_common_src_zero_point) ? 1 : ic); } } if (zp_cfg.do_wei_compensation && zp_cfg.is_runtime_wei_zero_points) { diff --git a/src/gpu/intel/jit/reorder/gen_reorder.cpp b/src/gpu/intel/jit/reorder/gen_reorder.cpp index 5f048447146..974b35210a8 100644 --- a/src/gpu/intel/jit/reorder/gen_reorder.cpp +++ b/src/gpu/intel/jit/reorder/gen_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
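Condensed sketch (standalone C++, parameter names are mine) of the two eligibility checks in zero_points_config_t above; the conv-based path acts as a fallback for cases where the reorder-based one is unavailable.

#include <cstdint>
#include <iostream>

// "wei_is_any" stands for invariant_wei_md()->format_kind == format_kind::any,
// "mb" for invariant_src_md()->dims[0]; 64 is the heuristic threshold quoted
// in the comment above, not a hard limit.
bool can_use_src_reorder_precalc(bool is_conv, bool wei_is_any,
        bool common_src_zp, bool default_wei_zp) {
    // A reorder can only embed the compensation if the convolution is free to
    // choose the weights layout and request the extra buffer via a md flag.
    return is_conv && wei_is_any && common_src_zp && default_wei_zp;
}

bool can_use_src_conv_precalc(bool is_conv, std::int64_t mb) {
    // The nested precalc convolution only pays off for large minibatches;
    // below the threshold, in-situ compensation in the main kernel is faster.
    return is_conv && mb >= 64;
}

int main() {
    std::cout << can_use_src_reorder_precalc(true, true, true, true) << " "
              << can_use_src_conv_precalc(true, 128) << "\n"; // prints: 1 1
    return 0;
}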
@@ -98,7 +98,7 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, | sm::rounding_mode; VDISPATCH_REORDER( attr()->has_default_values(skip_mask), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); + VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); VDISPATCH_REORDER(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_REORDER(scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); @@ -148,6 +148,7 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, check_layout(dst_layout), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); VDISPATCH_REORDER(compute_engine->mayiuse_ngen_kernels(), VERBOSE_UNSUPPORTED_DEVICE_FEATURE, "ngen_kernels"); + auto *gpu_attr = utils::downcast(attr()->gpu_attr_.get()); hw_t hw(engine); @@ -158,7 +159,8 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, cfg->set_zp_cfg(zp_cfg); VDISPATCH_REORDER_SC( init_kernel_info(), "kernel initialization unsuccessful"); - + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); return status::success; } @@ -202,6 +204,9 @@ status_t gen_reorder_t::pd_t::init_kernel_info() { } status_t gen_reorder_t::init(impl::engine_t *engine) { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + auto &cfg = *pd()->cfg; auto &info = *pd()->kernel_info; @@ -221,6 +226,7 @@ status_t gen_reorder_t::execute(const exec_ctx_t &ctx) const { info.set_args(arg_list, storage_list); CHECK(parallel_for(ctx, info.nd_range(), kernel_, arg_list)); + CHECK(pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_)); return status::success; } diff --git a/src/gpu/intel/jit/reorder/gen_reorder.hpp b/src/gpu/intel/jit/reorder/gen_reorder.hpp index c6aa048dfb3..478d5e030a4 100644 --- a/src/gpu/intel/jit/reorder/gen_reorder.hpp +++ b/src/gpu/intel/jit/reorder/gen_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ class gen_reorder_t : public gpu_primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } compute::kernel_t kernel_; + std::shared_ptr zp_precomp_conv_; }; } // namespace jit diff --git a/src/gpu/intel/ocl/ref_reorder.cpp b/src/gpu/intel/ocl/ref_reorder.cpp index e058b7091e4..edebcb3d305 100644 --- a/src/gpu/intel/ocl/ref_reorder.cpp +++ b/src/gpu/intel/ocl/ref_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
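The gen_reorder hooks above follow a three-step "maybe_*" pattern. The sketch below (placeholder types; the function names come from the patch but the signatures are simplified) shows that every step degenerates to a no-op when the destination memory descriptor does not request precomputed ZP compensation, so plain reorders keep their existing path.

#include <cassert>
#include <memory>

struct zp_conv_pd_t {};   // stands in for the nested zp precompute conv pd
struct zp_conv_t {};      // stands in for the nested zp precompute conv primitive

struct reorder_pd_t {
    bool dst_needs_zp_precalc = false;          // derived from dst md extra flags
    std::shared_ptr<zp_conv_pd_t> zp_conv_pd;

    void maybe_create_zp_precompute_conv_pd() { // called from pd_t::init()
        if (dst_needs_zp_precalc) zp_conv_pd = std::make_shared<zp_conv_pd_t>();
    }
    void maybe_create_zp_precompute_conv(std::shared_ptr<zp_conv_t> &prim) const {
        if (zp_conv_pd) prim = std::make_shared<zp_conv_t>(); // called from init()
    }
    void maybe_exec_zp_precompute_conv(const std::shared_ptr<zp_conv_t> &prim) const {
        if (prim) { /* run the nested convolution after the reorder kernel */ }
    }
};

int main() {
    reorder_pd_t pd; // plain reorder: all three calls are no-ops
    pd.maybe_create_zp_precompute_conv_pd();
    std::shared_ptr<zp_conv_t> prim;
    pd.maybe_create_zp_precompute_conv(prim);
    assert(!prim);
    pd.maybe_exec_zp_precompute_conv(prim);
    return 0;
}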
@@ -150,17 +150,19 @@ status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const { CHECK(large_parallel_for( ctx, nd_range, kernels_[0], arg_list, arg_list.nargs())); - if (!conf.subbyte_pack) return status::success; - - compute::kernel_arg_list_t repack_arg_list; - repack_arg_list.set(0, *tmp); - repack_arg_list.set(1, dst); - repack_arg_list.set(2, into(conf.nelems)); - repack_arg_list.set(3, 4); - compute::range_t repack_gws((conf.nelems * 4 + 7) / 8); - compute::nd_range_t repack_nd_range(repack_gws); - return large_parallel_for( - ctx, repack_nd_range, kernels_[1], repack_arg_list, 4); + if (conf.subbyte_pack) { + compute::kernel_arg_list_t repack_arg_list; + repack_arg_list.set(0, *tmp); + repack_arg_list.set(1, dst); + repack_arg_list.set(2, into(conf.nelems)); + repack_arg_list.set(3, 4); + compute::range_t repack_gws((conf.nelems * 4 + 7) / 8); + compute::nd_range_t repack_nd_range(repack_gws); + CHECK(large_parallel_for( + ctx, repack_nd_range, kernels_[1], repack_arg_list, 4)); + } + CHECK(pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_)); + return status::success; } } // namespace ocl diff --git a/src/gpu/intel/ocl/ref_reorder.hpp b/src/gpu/intel/ocl/ref_reorder.hpp index 7b312fad53c..7feed0402f7 100644 --- a/src/gpu/intel/ocl/ref_reorder.hpp +++ b/src/gpu/intel/ocl/ref_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,8 +121,10 @@ struct ref_reorder_t : public gpu_primitive_t { VERBOSE_UNSUPPORTED_DT_CFG); VDISPATCH_REORDER_SC(init_conf(engine), "init_conf()"); - init_scratchpad(); + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); + init_scratchpad(); return status::success; } @@ -137,6 +139,9 @@ struct ref_reorder_t : public gpu_primitive_t { }; status_t init(impl::engine_t *engine) override { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + compute::kernel_ctx_t kernel_ctx; auto status = pd()->init_kernel_ctx(kernel_ctx); @@ -161,6 +166,7 @@ struct ref_reorder_t : public gpu_primitive_t { private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::vector kernels_; + std::shared_ptr zp_precomp_conv_; }; } // namespace ocl
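The ref_reorder_t::execute() rework above replaces an early return with an if-block for the same reason as in gen_reorder: the zero-point precompute convolution must run after every reorder variant. A minimal standalone sketch of that control-flow change (the CHECK macro and status values below are stand-ins, not the oneDNN definitions):

#include <iostream>

enum status_t { success = 0, runtime_error = 1 };
#define CHECK(f) \
    do { \
        status_t s_ = (f); \
        if (s_ != success) return s_; \
    } while (0)

status_t run_repack() { return success; }
status_t run_zp_precompute_conv() { return success; }

status_t execute(bool subbyte_pack) {
    // ... main reorder kernel would be launched here ...
    if (subbyte_pack) CHECK(run_repack()); // optional repack, no early return
    CHECK(run_zp_precompute_conv());       // common tail step, never skipped
    return success;
}

int main() {
    std::cout << (execute(true) == success) << "\n"; // prints: 1
    return 0;
}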