From 0993edf7df99e89d89b522a983336a78fda5c402 Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Thu, 12 Dec 2024 15:53:16 -0800 Subject: [PATCH 1/2] src: common: make rnn_s8s8_compensation a power of 2 --- src/common/memory_desc.hpp | 14 ++++++------- src/common/memory_desc_wrapper.hpp | 28 +++++++++----------------- src/common/primitive_hashing.cpp | 8 +++----- src/common/serialization.cpp | 10 +++------ src/common/type_helpers.hpp | 19 +++-------------- src/cpu/rnn/rnn_reorders.hpp | 11 ++-------- src/gpu/intel/ocl/rnn/rnn_reorders.hpp | 12 +++-------- 7 files changed, 31 insertions(+), 71 deletions(-) diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index 468a8528ec2..5dc820c67c1 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ const rnn_packed_memory_format_t ldio_p = rnn_packed_memory_format_t::ldio_p; // TODO: convert to 'enum class'. // Flags for memory special features enum memory_extra_flags_t { - dnnl_memory_extra_flag_none = 0x0U, + dnnl_memory_extra_flag_none = 0u, // Indicates the weights have an additional buffer, that depends on the // @p compensation_mask. // @@ -64,13 +64,13 @@ enum memory_extra_flags_t { // the additional buffer would consist of OC values: // O[oc : 0,OC] = // -128 * SUM(ic : 0,IC; kh : 0,KH; kw : 0,KW){ weights(oc, ic, kh, kw) } - dnnl_memory_extra_flag_compensation_conv_s8s8 = 0x1U, - dnnl_memory_extra_flag_scale_adjust = 0x2U, - dnnl_memory_extra_flag_rnn_u8s8_compensation = 0x4U, + dnnl_memory_extra_flag_compensation_conv_s8s8 = 1u, + dnnl_memory_extra_flag_scale_adjust = 2u, + dnnl_memory_extra_flag_rnn_u8s8_compensation = 4u, dnnl_memory_extra_flag_gpu_rnn_u8s8_compensation = dnnl_memory_extra_flag_rnn_u8s8_compensation, - dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 0x8U, - dnnl_memory_extra_flag_rnn_s8s8_compensation = 0x16U, + dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, + dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, }; // Create aliases for extra flags to preserve the old behavior. diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 5cf2e2f66ba..847951ba558 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -149,9 +149,7 @@ struct memory_desc_wrapper : public c_compatible { size_t additional_buffer_data_size(uint64_t flag_select) const { using namespace memory_extra_flags; if (flag_select & compensation_conv_s8s8) return sizeof(int32_t); - if ((flag_select & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set(flag_select)) - return sizeof(float); + if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); return 0; @@ -160,19 +158,16 @@ struct memory_desc_wrapper : public c_compatible { /** return true if memory format has additional buffer */ bool is_additional_buffer() const { using namespace memory_extra_flags; - // Currently compensation is not required for rnn_s8s8_compensation, - // but it has common bit with rnn_u8s8_compensation constant so we have - // to exclude rnn_s8s8_compensation case explicitly - return ((extra().flags - & (compensation_conv_s8s8 | rnn_u8s8_compensation - | compensation_conv_asymmetric_src)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)); + return extra().flags + & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_conv_asymmetric_src); } /** returns the size required for a particular extra memory buffer */ size_t additional_buffer_size(memory_extra_flags_t flag) const { using namespace memory_extra_flags; + const auto flags = extra().flags; + if (!(flags & flag)) return 0; const auto ndims = this->ndims(); const auto &pdims = padded_dims(); @@ -186,18 +181,15 @@ struct memory_desc_wrapper : public c_compatible { return (size_t)prod * buff_data_size; }; - if (extra().flags & compensation_conv_s8s8) { + if (flag == compensation_conv_s8s8) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - - if ((extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)) { + if (flag == rnn_u8s8_compensation) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - if (extra().flags & compensation_conv_asymmetric_src) { + if (flag == compensation_conv_asymmetric_src) { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index 7c51d4d5de3..a8d9f25ce8c 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -190,11 +190,9 @@ size_t get_md_hash(const memory_desc_t &md) { if (md.extra.flags != dnnl_memory_extra_flag_none) { seed = hash_combine(seed, md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | dnnl_memory_extra_flag_rnn_u8s8_compensation)) { seed = hash_combine(seed, md.extra.compensation_mask); } diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index fe43c2e2efc..8e40dd29819 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -120,18 +120,14 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { if (md.extra.flags != dnnl_memory_extra_flag_none) { sstream.write(&md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && 
!types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | dnnl_memory_extra_flag_rnn_u8s8_compensation)) { sstream.write(&md.extra.compensation_mask); } - if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) { sstream.write(&md.extra.scale_adjust); } - if (md.extra.flags & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { sstream.write(&md.extra.asymm_compensation_mask); diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index ef617b4731d..7a6efb9d986 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -299,28 +299,15 @@ inline format_kind_t format_tag_to_kind(format_tag_t tag) { return format_kind::undef; } -// Currently rnn_s8s8_compensation has common bits with rnn_u8s8_compensation -// and scale_adjust constants so we have to perform additional checks to -// separate these two cases -inline bool extra_flag_rnn_s8s8_compensation_is_set(uint64_t flags) { - return ((flags & memory_extra_flags::rnn_s8s8_compensation) - ^ memory_extra_flags::rnn_s8s8_compensation) - == 0; -} - inline bool memory_extra_desc_is_equal( const memory_extra_desc_t &lhs, const memory_extra_desc_t &rhs) { using namespace memory_extra_flags; - return true && lhs.flags == rhs.flags + return lhs.flags == rhs.flags && IMPLICATION(lhs.flags & compensation_conv_s8s8, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & rnn_u8s8_compensation) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & rnn_u8s8_compensation, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & scale_adjust) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); diff --git a/src/cpu/rnn/rnn_reorders.hpp b/src/cpu/rnn/rnn_reorders.hpp index 5156350d860..e96828d369c 100644 --- a/src/cpu/rnn/rnn_reorders.hpp +++ b/src/cpu/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -779,12 +779,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { return unimplemented; // Check the proper memory desc has been passed to u8s8 and s8s8 - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases const bool check_u8s8 = (od.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - od.extra().flags) && od.extra().compensation_mask == ((id.ndims() == 5) ? 27 /* 11011 */ : 13 /* 1101 */); @@ -886,9 +881,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { .template get(memory_tracking::names:: key_reorder_rnn_weights_reduction); float *comp = reinterpret_cast(dst + compensation_offset); - const bool req_s8s8_comp = (dst_d.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - dst_d.extra().flags); + const bool req_s8s8_comp = dst_d.extra().flags & rnn_u8s8_compensation; const auto mask_ok = [&](int mask) { return mask == ((src_d.ndims() == 5) ? 
27 /* 11011 */ diff --git a/src/gpu/intel/ocl/rnn/rnn_reorders.hpp b/src/gpu/intel/ocl/rnn/rnn_reorders.hpp index 80f1ed4c0b3..5b72142ce0a 100644 --- a/src/gpu/intel/ocl/rnn/rnn_reorders.hpp +++ b/src/gpu/intel/ocl/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,14 +42,8 @@ struct rnn_weights_reorder_t : public gpu_primitive_t { status_t init(impl::engine_t *engine, impl::engine_t *src_engine, impl::engine_t *dst_engine) { - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases - VDISPATCH_REORDER( - !IMPLICATION(dst_md()->extra.flags - & memory_extra_flags::rnn_u8s8_compensation, - types::extra_flag_rnn_s8s8_compensation_is_set( - dst_md()->extra.flags)), + VDISPATCH_REORDER(dst_md()->extra.flags + & memory_extra_flags::rnn_u8s8_compensation, VERBOSE_BAD_FLAGS); VDISPATCH_REORDER(utils::one_of(src_engine->kind(), From ce8008183a8c7ebd41f1edf4c418ffd96af5074d Mon Sep 17 00:00:00 2001 From: "Guskov, Andrey Y" Date: Wed, 8 Jan 2025 23:30:10 -0800 Subject: [PATCH 2/2] src: gpu: intel: jit: conv: add reorder-based precomputed zero points --- src/common/memory_desc.cpp | 7 +- src/common/memory_desc.hpp | 41 ++++- src/common/memory_desc_wrapper.hpp | 8 + src/common/primitive_hashing.cpp | 9 + src/common/serialization.cpp | 8 + src/common/type_helpers.hpp | 8 +- src/common/verbose.cpp | 15 ++ src/cpu/reorder/cpu_reorder_pd.hpp | 5 +- src/gpu/generic/convolution_deconvolution.hpp | 9 +- src/gpu/generic/cross_engine_reorder.cpp | 52 ++++-- src/gpu/generic/cross_engine_reorder.hpp | 9 +- src/gpu/gpu_reorder_pd.cpp | 101 +++++++++++ src/gpu/gpu_reorder_pd.hpp | 32 +++- src/gpu/gpu_utils.hpp | 2 +- src/gpu/gpu_zero_points_conv.cpp | 96 +++++++++++ src/gpu/gpu_zero_points_conv.hpp | 36 ++++ src/gpu/intel/jit/codegen/kernel.hpp | 57 ++++-- src/gpu/intel/jit/codegen/reorder.hpp | 12 +- src/gpu/intel/jit/conv/config.cpp | 77 +++++++++ src/gpu/intel/jit/conv/config.hpp | 4 +- src/gpu/intel/jit/conv/gen_convolution.cpp | 163 ++++++++---------- src/gpu/intel/jit/conv/normalization.cpp | 27 +-- src/gpu/intel/jit/conv/normalization.hpp | 5 +- src/gpu/intel/jit/conv/zp_plan.cpp | 12 +- src/gpu/intel/jit/conv/zp_plan.hpp | 3 +- src/gpu/intel/jit/ir/epilogue.cpp | 3 + src/gpu/intel/jit/ir/kernel_info.hpp | 6 +- src/gpu/intel/jit/ir/post_ops.cpp | 25 ++- src/gpu/intel/jit/ir/post_ops.hpp | 29 +++- src/gpu/intel/jit/ir/tensor_config.cpp | 10 +- src/gpu/intel/jit/reorder/gen_reorder.cpp | 12 +- src/gpu/intel/jit/reorder/gen_reorder.hpp | 3 +- src/gpu/intel/ocl/ref_reorder.cpp | 26 +-- src/gpu/intel/ocl/ref_reorder.hpp | 10 +- 34 files changed, 703 insertions(+), 219 deletions(-) create mode 100644 src/gpu/gpu_reorder_pd.cpp create mode 100644 src/gpu/gpu_zero_points_conv.cpp create mode 100644 src/gpu/gpu_zero_points_conv.hpp diff --git a/src/common/memory_desc.cpp b/src/common/memory_desc.cpp index f9345a72302..5d5a0958b52 100644 --- a/src/common/memory_desc.cpp +++ b/src/common/memory_desc.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 
(the "License"); * you may not use this file except in compliance with the License. @@ -471,8 +471,9 @@ status_t memory_desc_permute_axes(memory_desc_t &out_memory_desc, VCHECK_MEMORY( !memory_desc_wrapper(in_memory_desc).has_runtime_dims_or_strides(), invalid_arguments, VERBOSE_UNSUPPORTED_MEM_STRIDE); - VCHECK_MEMORY(in_memory_desc.extra.flags == 0, invalid_arguments, - VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); + VCHECK_MEMORY( + check_md_extra_flags_compensation_gpu(in_memory_desc.extra.flags), + invalid_arguments, VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); // verify that perm is indeed a permutation of [0 .. ndims) unsigned occurrence_mask = 0; diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index 5dc820c67c1..3b9dd8d0b1e 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -71,6 +71,15 @@ enum memory_extra_flags_t { = dnnl_memory_extra_flag_rnn_u8s8_compensation, dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, + // This flag has to be kept separate from *compensation_conv_asymmetric_src + // since the GPU precompute algorithm is incompatible with that of the CPU + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src = 32u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when precompute is to be performed for a backward-by-data convolution + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd = 64u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when IC and OC are swapped to reinterpret a deconv as a BWD_D conv + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap = 128u, }; // Create aliases for extra flags to preserve the old behavior. @@ -87,8 +96,23 @@ const memory_extra_flags_t rnn_s8s8_compensation = dnnl_memory_extra_flag_rnn_s8s8_compensation; const memory_extra_flags_t compensation_conv_asymmetric_src = dnnl_memory_extra_flag_compensation_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_bwd + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_swap + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap; } // namespace memory_extra_flags +inline bool check_md_extra_flags_compensation_gpu(uint64_t flags) { + using namespace memory_extra_flags; + const uint64_t c = compensation_gpu_conv_asymmetric_src; + const uint64_t b = compensation_gpu_conv_asymmetric_src_bwd; + const uint64_t s = compensation_gpu_conv_asymmetric_src_swap; + return (flags == none) || (flags == c) || (flags == (c | b)) + || (flags == (c | b | s)); +} + // Generic description of blocked data layout for most memory formats. struct blocking_desc_t { // The strides between the outermost blocks. @@ -208,7 +232,12 @@ struct memory_extra_desc_t { : flags(0) , compensation_mask(0) , scale_adjust(0.0f) - , asymm_compensation_mask(0) {} + , asymm_compensation_mask(0) + , idhw {0, 0, 0} + , odhw {0, 0, 0} + , pdhw {0, 0, 0} + , ddhw {0, 0, 0} + , dst_size(0) {} // The flags contain arbitrary extra information, such as compensation. 
// @sa dnnl_memory_extra_flags_t uint64_t flags; @@ -218,6 +247,16 @@ struct memory_extra_desc_t { float scale_adjust; // Compensation mask for asymmetric quantization int asymm_compensation_mask; + // Precomp GPU ZP convolution input spatials + dim_t idhw[3]; + // Precomp GPU ZP convolution output spatials + dim_t odhw[3]; + // Precomp GPU ZP convolution padding spatials + dim_t pdhw[3]; + // Precomp GPU ZP convolution dilation spatials + dim_t ddhw[3]; + // Precomp GPU ZP convolution destination size + dim_t dst_size; }; status_t DNNL_API memory_desc_init_by_tag(memory_desc_t &memory_desc, int ndims, diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 847951ba558..9b32468975b 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -152,6 +152,8 @@ struct memory_desc_wrapper : public c_compatible { if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); + if (flag_select & compensation_gpu_conv_asymmetric_src) + return sizeof(int32_t); return 0; } @@ -160,6 +162,7 @@ struct memory_desc_wrapper : public c_compatible { using namespace memory_extra_flags; return extra().flags & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_gpu_conv_asymmetric_src | compensation_conv_asymmetric_src); } @@ -193,6 +196,9 @@ struct memory_desc_wrapper : public c_compatible { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } + if (flag == compensation_gpu_conv_asymmetric_src) { + return extra().dst_size; + } return 0; } @@ -212,6 +218,8 @@ struct memory_desc_wrapper : public c_compatible { buff_size += additional_buffer_size(compensation_conv_s8s8); buff_size += additional_buffer_size(rnn_u8s8_compensation); buff_size += additional_buffer_size(compensation_conv_asymmetric_src); + buff_size + += additional_buffer_size(compensation_gpu_conv_asymmetric_src); return buff_size; } diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index a8d9f25ce8c..a7a0f9ed295 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -204,6 +204,15 @@ size_t get_md_hash(const memory_desc_t &md) { & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { seed = hash_combine(seed, md.extra.asymm_compensation_mask); } + + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + seed = get_array_hash(seed, md.extra.idhw, 3); + seed = get_array_hash(seed, md.extra.odhw, 3); + seed = get_array_hash(seed, md.extra.pdhw, 3); + seed = get_array_hash(seed, md.extra.ddhw, 3); + seed = hash_combine(seed, md.extra.dst_size); + } } // Combined hash for a memory descriptor return seed; diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp index 8e40dd29819..afe9c37f49e 100644 --- a/src/common/serialization.cpp +++ b/src/common/serialization.cpp @@ -132,6 +132,14 @@ void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { sstream.write(&md.extra.asymm_compensation_mask); } + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + sstream.write(md.extra.idhw, 3); + sstream.write(md.extra.odhw, 3); + sstream.write(md.extra.pdhw, 3); + sstream.write(md.extra.ddhw, 3); + sstream.write(&md.extra.dst_size); + } } } diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index 7a6efb9d986..c8abbbe4364 
100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -310,7 +310,13 @@ inline bool memory_extra_desc_is_equal( && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, - lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); + lhs.asymm_compensation_mask == rhs.asymm_compensation_mask) + && IMPLICATION(lhs.flags & compensation_gpu_conv_asymmetric_src, + (lhs.dst_size == rhs.dst_size) + && utils::array_cmp(lhs.idhw, rhs.idhw, 3) + && utils::array_cmp(lhs.odhw, rhs.odhw, 3) + && utils::array_cmp(lhs.pdhw, rhs.pdhw, 3) + && utils::array_cmp(lhs.ddhw, rhs.ddhw, 3)); } inline bool blocking_desc_is_equal(const memory_desc_t &lhs_md, diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp index 63a9df9a2e1..76dfb31f46b 100644 --- a/src/common/verbose.cpp +++ b/src/common/verbose.cpp @@ -414,6 +414,21 @@ std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) { ss << ":s8m" << extra.compensation_mask; if (extra.flags & compensation_conv_asymmetric_src) ss << ":zpm" << extra.asymm_compensation_mask; + if (extra.flags & compensation_gpu_conv_asymmetric_src) { + ss << ":zid" << extra.idhw[0]; + ss << ":zih" << extra.idhw[1]; + ss << ":ziw" << extra.idhw[2]; + ss << ":zod" << extra.odhw[0]; + ss << ":zoh" << extra.odhw[1]; + ss << ":zow" << extra.odhw[2]; + ss << ":zpd" << extra.pdhw[0]; + ss << ":zph" << extra.pdhw[1]; + ss << ":zpw" << extra.pdhw[2]; + ss << ":zdd" << extra.ddhw[0]; + ss << ":zdh" << extra.ddhw[1]; + ss << ":zdw" << extra.ddhw[2]; + ss << ":zs" << extra.dst_size; + } if (extra.flags & scale_adjust && extra.scale_adjust != 1.f) ss << ":sa" << extra.scale_adjust; return ss; diff --git a/src/cpu/reorder/cpu_reorder_pd.hpp b/src/cpu/reorder/cpu_reorder_pd.hpp index d1c8499c151..ca69992b0fe 100644 --- a/src/cpu/reorder/cpu_reorder_pd.hpp +++ b/src/cpu/reorder/cpu_reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,9 @@ struct cpu_reorder_pd_t : public reorder_pd_t { post_ops.len() == 1 && post_ops.entry_[0].kind == primitive_kind::sum); VDISPATCH_REORDER(args_ok, VERBOSE_UNSUPPORTED_POSTOP); + auto gpu_zp = memory_extra_flags::compensation_gpu_conv_asymmetric_src; + VDISPATCH_REORDER(!(dst_md()->extra.flags & gpu_zp), + VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); return status::success; } diff --git a/src/gpu/generic/convolution_deconvolution.hpp b/src/gpu/generic/convolution_deconvolution.hpp index 74893d4c5db..1c07d94522d 100644 --- a/src/gpu/generic/convolution_deconvolution.hpp +++ b/src/gpu/generic/convolution_deconvolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,12 +32,15 @@ namespace generic { static status_t weights_axes_permutation( memory_desc_t *o_md, const memory_desc_t *i_md, bool with_groups) { + using namespace memory_extra_flags; int perm[DNNL_MAX_NDIMS] {}; // deconv to conv weight permutation for (int d = 0; d < DNNL_MAX_NDIMS; ++d) perm[d] = d; nstl::swap(perm[0 + with_groups], perm[1 + with_groups]); - - return memory_desc_permute_axes(*o_md, *i_md, perm); + CHECK(memory_desc_permute_axes(*o_md, *i_md, perm)); + if (o_md->extra.flags & compensation_gpu_conv_asymmetric_src) + o_md->extra.flags |= compensation_gpu_conv_asymmetric_src_swap; + return status::success; } static status_t conv_descr_create( diff --git a/src/gpu/generic/cross_engine_reorder.cpp b/src/gpu/generic/cross_engine_reorder.cpp index 6ded618a9c9..cbf4672c4c6 100644 --- a/src/gpu/generic/cross_engine_reorder.cpp +++ b/src/gpu/generic/cross_engine_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,20 +27,18 @@ namespace impl { namespace gpu { namespace generic { -void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *engine) { - using namespace memory_tracking::names; - if (!do_reorder_) return; - - auto *gpu_engine = utils::downcast(engine); - - const memory_desc_wrapper wspace_md( - desc()->src_engine_kind == reorder_engine_kind_ ? dst_md() - : src_md()); - auto scratchpad = scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_reorder_cross_space, - wspace_md.size(), 1, gpu_engine->get_buffer_alignment()); - scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), 1, - gpu_engine->get_buffer_alignment()); +void cross_engine_reorder_t::pd_t::init_scratchpad(impl::engine_t *gpu_engine) { + if (do_reorder_) { + using namespace memory_tracking::names; + auto gpu_align = utils::downcast(gpu_engine) + ->get_buffer_alignment(); + auto scratchpad = scratchpad_registry().registrar(); + auto needs_dst = desc()->src_engine_kind == reorder_engine_kind_; + memory_desc_wrapper wspace((needs_dst) ? 
dst_md() : src_md()); + scratchpad.book(key_reorder_cross_space, wspace.size(), 1, gpu_align); + scratchpad.book(key_nested, reorder_pd_->scratchpad_registry().size(), + 1, gpu_align); + } } status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, @@ -50,7 +48,7 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, dst_engine->kind()), VERBOSE_BAD_ENGINE_KIND); VDISPATCH_REORDER(attr_ok(), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); + VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); memory_desc_wrapper src_mdw(src_md()); memory_desc_wrapper dst_mdw(dst_md()); @@ -72,17 +70,31 @@ status_t cross_engine_reorder_t::pd_t::init(impl::engine_t *engine, primitive_attr_t r_attr(*attr()); if (!r_attr.is_initialized()) return status::out_of_memory; - VDISPATCH_REORDER_SC(reorder_primitive_desc_create(reorder_pd_, - reorder_engine, src_md(), dst_md(), &r_attr), + auto clean_src_md = *src_md(); + auto clean_dst_md = *dst_md(); + clean_src_md.extra = clean_dst_md.extra = {}; + VDISPATCH_REORDER_SC( + reorder_primitive_desc_create(reorder_pd_, reorder_engine, + &clean_src_md, &clean_dst_md, &r_attr), VERBOSE_PRIMITIVE_CREATION_FAIL, "reorder"); - init_scratchpad(engine); reorder_pd_t::init_desc( src_engine->kind(), dst_engine->kind(), true /* is_cross_engine */); + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); + init_scratchpad( + (dst_engine->kind() == engine_kind::gpu) ? dst_engine : src_engine); return status::success; } +status_t cross_engine_reorder_t::init(impl::engine_t *engine) { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + if (!pd()->do_reorder_) return status::success; + return create_nested_primitive(reorder_, pd()->reorder_pd_, engine); +} + status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const { using namespace memory_tracking::names; auto *gpu_stream = utils::downcast(ctx.stream()); @@ -158,6 +170,8 @@ status_t cross_engine_reorder_t::execute(const exec_ctx_t &ctx) const { ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC), ctx.input(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)); } + if (status == status::success) + status = pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_); } return status; } diff --git a/src/gpu/generic/cross_engine_reorder.hpp b/src/gpu/generic/cross_engine_reorder.hpp index cd69fefefaf..c6557ddaaeb 100644 --- a/src/gpu/generic/cross_engine_reorder.hpp +++ b/src/gpu/generic/cross_engine_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,16 +57,13 @@ struct cross_engine_reorder_t : public gpu::primitive_t { DECLARE_GPU_REORDER_CREATE(); }; - status_t init(impl::engine_t *engine) override { - if (!pd()->do_reorder_) return status::success; - return create_nested_primitive(reorder_, pd()->reorder_pd_, engine); - } - + status_t init(impl::engine_t *engine) override; status_t execute(const exec_ctx_t &ctx) const override; private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::shared_ptr reorder_; + std::shared_ptr zp_precomp_conv_; }; } // namespace generic diff --git a/src/gpu/gpu_reorder_pd.cpp b/src/gpu/gpu_reorder_pd.cpp new file mode 100644 index 00000000000..ca293db5c89 --- /dev/null +++ b/src/gpu/gpu_reorder_pd.cpp @@ -0,0 +1,101 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/gpu_reorder_pd.hpp" +#include "gpu/gpu_engine.hpp" +#include "gpu/gpu_stream.hpp" +#include "gpu/gpu_zero_points_conv.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t gpu_reorder_pd_t::maybe_create_zp_precompute_conv_pd( + impl::engine_t *dst_engine) { + memory_desc_wrapper dst_mdw(dst_md()); + auto &extra = dst_mdw.extra(); + auto needs_conv = memory_extra_flags::compensation_gpu_conv_asymmetric_src; + auto is_dst_gpu = (dst_engine->kind() == engine_kind::gpu); + do_zp_precomp_conv_ = is_dst_gpu && (extra.flags & needs_conv); + if (!do_zp_precomp_conv_) return status::success; + + using namespace memory_extra_flags; + const auto out_type = data_type::f32; + primitive_attr_t attr; + const bool is_bwd_d + = extra.flags & compensation_gpu_conv_asymmetric_src_bwd; + auto prop = (is_bwd_d) ? prop_kind::backward_data + : prop_kind::forward_inference; + CHECK(create_zp_precompute_conv_pd(zp_precomp_conv_pd_, dst_engine, attr, + dst_md(), extra.idhw, extra.odhw, extra.pdhw, extra.ddhw, out_type, + prop)); + + using namespace memory_tracking::names; + auto gpu_align = utils::downcast(dst_engine) + ->get_buffer_alignment(); + auto scratchpad = scratchpad_registry().registrar(); + auto registry = zp_precomp_conv_pd_->scratchpad_registry(); + memory_desc_wrapper wspace((is_bwd_d) ? 
zp_precomp_conv_pd_->diff_dst_md() + : zp_precomp_conv_pd_->src_md()); + scratchpad.book(key_conv_tr_src, wspace.size(), 1, gpu_align); + scratchpad.book(key_conv_tails, registry.size(), 1, gpu_align); + return status::success; +} + +status_t gpu_reorder_pd_t::maybe_create_zp_precompute_conv( + std::shared_ptr &zp_precomp_conv, + impl::engine_t *engine, gpu::primitive_t *primitive) const { + if (!do_zp_precomp_conv_) return status::success; + return primitive->create_nested_primitive( + zp_precomp_conv, zp_precomp_conv_pd_, engine); +} + +status_t gpu_reorder_pd_t::maybe_exec_zp_precompute_conv(const exec_ctx_t &ctx, + const std::shared_ptr &zp_precomp_conv) const { + using namespace memory_tracking::names; + if (!do_zp_precomp_conv_) return status::success; + + const bool is_bwd_d = (zp_precomp_conv_pd_->get_prop_kind() + == prop_kind::backward_data); + auto *gpu_stream = utils::downcast(ctx.stream()); + auto conv_md_in = (is_bwd_d) ? zp_precomp_conv_pd_->diff_dst_md() + : zp_precomp_conv_pd_->src_md(); + auto scratchpad + = ctx.get_scratchpad_grantor().get_memory_storage(key_conv_tr_src); + std::unique_ptr wspace; + CHECK(safe_ptr_assign(wspace, + new memory_t(ctx.stream()->engine(), conv_md_in, + std::move(scratchpad)))); + CHECK(gpu_stream->fill(*wspace->memory_storage(), 0x01, + memory_desc_wrapper(conv_md_in).size(), + gpu_stream->ctx().get_deps(), gpu_stream->ctx().get_deps())); + + exec_args_t r_args; + auto arg_in = (is_bwd_d) ? DNNL_ARG_DIFF_DST : DNNL_ARG_SRC; + auto arg_out = (is_bwd_d) ? DNNL_ARG_DIFF_SRC : DNNL_ARG_DST; + r_args[arg_in] = memory_arg_t {(memory_t *)wspace.get(), true}; + r_args[DNNL_ARG_WEIGHTS] = memory_arg_t {ctx.output(DNNL_ARG_TO), true}; + r_args[arg_out] = memory_arg_t {ctx.output(DNNL_ARG_TO), false}; + exec_ctx_t r_ctx(ctx, std::move(r_args)); + + nested_scratchpad_t ns(ctx, key_conv_tails, zp_precomp_conv); + r_ctx.set_scratchpad_grantor(ns.grantor()); + return zp_precomp_conv->execute(r_ctx); +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/gpu_reorder_pd.hpp b/src/gpu/gpu_reorder_pd.hpp index d70c28bdd81..71617d96dc8 100644 --- a/src/gpu/gpu_reorder_pd.hpp +++ b/src/gpu/gpu_reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #define GPU_GPU_REORDER_PD_HPP #include "common/reorder_pd.hpp" +#include "gpu/gpu_primitive.hpp" namespace dnnl { namespace impl { @@ -28,10 +29,9 @@ struct gpu_reorder_pd_t : public reorder_pd_t { protected: bool attr_ok() const { - return attr()->has_default_values( - dnnl_primitive_attr::skip_mask_t::zero_points_runtime - | dnnl_primitive_attr::skip_mask_t::scales_runtime - | dnnl_primitive_attr::skip_mask_t::post_ops) + using sm = dnnl_primitive_attr::skip_mask_t; + return attr()->has_default_values(sm::zero_points_runtime + | sm::scales_runtime | sm::post_ops) && post_ops_ok() && zero_points_ok(); } @@ -62,9 +62,27 @@ struct gpu_reorder_pd_t : public reorder_pd_t { && post_ops.entry_[0].kind == primitive_kind::sum); } - bool extra_ok() const { - return src_md()->extra.flags == 0 && dst_md()->extra.flags == 0; + bool extra_ok(bool accept_conv_asymm = false) const { + if (!accept_conv_asymm) + return (src_md()->extra.flags == memory_extra_flags::none) + && (dst_md()->extra.flags == memory_extra_flags::none); + return check_md_extra_flags_compensation_gpu(src_md()->extra.flags) + && check_md_extra_flags_compensation_gpu(dst_md()->extra.flags); } + + status_t maybe_create_zp_precompute_conv_pd(impl::engine_t *dst_engine); + +public: + status_t maybe_create_zp_precompute_conv( + std::shared_ptr &zp_precomp_conv, + impl::engine_t *engine, gpu::primitive_t *primitive) const; + + status_t maybe_exec_zp_precompute_conv(const exec_ctx_t &ctx, + const std::shared_ptr &zp_precomp_conv) const; + +private: + bool do_zp_precomp_conv_ = false; + std::shared_ptr zp_precomp_conv_pd_; }; } // namespace gpu diff --git a/src/gpu/gpu_utils.hpp b/src/gpu/gpu_utils.hpp index 18c82b1dccc..fe56ccaba41 100644 --- a/src/gpu/gpu_utils.hpp +++ b/src/gpu/gpu_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/gpu/gpu_zero_points_conv.cpp b/src/gpu/gpu_zero_points_conv.cpp new file mode 100644 index 00000000000..0e1edb567d9 --- /dev/null +++ b/src/gpu/gpu_zero_points_conv.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include + +#include "common/convolution_pd.hpp" +#include "common/primitive_desc_iterator.hpp" +#include "gpu/gpu_zero_points_conv.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t create_zp_precompute_conv_pd(std::shared_ptr &retn, + dnnl::impl::engine_t *eng, const primitive_attr_t &attr, + const memory_desc_t *wei, const dim_t *idhw, const dim_t *odhw, + const dim_t *pdhw, const dim_t *ddhw, data_type_t out_type, + prop_kind_t prop, bool has_offset0) { + using namespace memory_extra_flags; + auto real_wei = *wei; + const int off = (!idhw[1]) ? 2 + !idhw[2] : !idhw[0]; + const bool with_groups = (real_wei.ndims == (6 - off)); + if (real_wei.extra.flags & compensation_gpu_conv_asymmetric_src_swap) { + static_assert(DNNL_MAX_NDIMS == 12, "DNNL_MAX_NDIMS is not 12"); + std::array perm_grp + = {0, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + std::array perm_no_grp + = {1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + CHECK(memory_desc_permute_axes(real_wei, *wei, + (with_groups) ? perm_grp.data() : perm_no_grp.data())); + } + real_wei.extra = memory_extra_desc_t(); + + const auto &dims = real_wei.dims; + const bool is_fwd = ((prop == prop_kind::forward_training) + || (prop == prop_kind::forward_inference)); + const bool is_bwd_d = (prop == prop_kind::backward_data); + assert((off < 3) && (real_wei.ndims >= 5 - off) && (is_fwd || is_bwd_d)); + MAYBE_UNUSED(is_fwd); + + using memory_dims = std::vector; + memory_dims S1 {1, 1, 1}; + memory_dims P1 {0, 0, 0}; + // dim order for weights: [G,] OC, IC, [[[D,] H,] W] + memory_dims dims_in {1, + (with_groups) ? dims[0] * dims[2 - is_bwd_d] : dims[1 - is_bwd_d]}; + memory_dims dims_out {1, + (with_groups) ? dims[0] * dims[1 + is_bwd_d] : dims[0 + is_bwd_d]}; + for (int i = off; i < 3; i++) { + const auto k_idx = 2 + with_groups + i - off; + const auto KD = (dims[k_idx] - 1) * (ddhw[i] + 1) + 1; + dims_in.emplace_back(idhw[i]); + dims_out.emplace_back(odhw[i]); + P1[i] = dims_out.back() - dims_in.back() - 1 + KD - pdhw[i]; + } + + memory_desc_t in, out; + CHECK(memory_desc_init_by_tag(out, int(dims_out.size()), dims_out.data(), + out_type, format_tag::any)); + CHECK(memory_desc_init_by_tag(in, int(dims_in.size()), dims_in.data(), + data_type::s8, format_tag::any)); + + if (has_offset0) { + auto out_type_size = types::data_type_size(out_type); + auto offset0 = memory_desc_wrapper(real_wei).size(0, false); + assert(offset0 % out_type_size == 0); + out.offset0 = offset0 / out_type_size; + } + auto conv_desc = convolution_desc_t(); + CHECK(dnnl::impl::conv_desc_init(&conv_desc, prop, + alg_kind::convolution_direct, (is_bwd_d) ? &out : &in, &real_wei, + nullptr, (is_bwd_d) ? &in : &out, S1.data() + off, ddhw + off, + pdhw + off, P1.data() + off)); + primitive_desc_iterator_t it(eng, (op_desc_t *)&conv_desc, &attr, nullptr); + if (!it.is_initialized()) return status::out_of_memory; + retn = *(++it); + return (retn) ? 
status::success : status::unimplemented; +} + +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/gpu_zero_points_conv.hpp b/src/gpu/gpu_zero_points_conv.hpp new file mode 100644 index 00000000000..e287454b4ec --- /dev/null +++ b/src/gpu/gpu_zero_points_conv.hpp @@ -0,0 +1,36 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef GPU_GPU_ZERO_POINTS_CONV_HPP +#define GPU_GPU_ZERO_POINTS_CONV_HPP + +#include "common/primitive_desc.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { + +status_t create_zp_precompute_conv_pd(std::shared_ptr &retn, + dnnl::impl::engine_t *eng, const primitive_attr_t &attr, + const memory_desc_t *wei, const dim_t *idhw, const dim_t *odhw, + const dim_t *pdhw, const dim_t *ddhw, data_type_t out_type, + prop_kind_t prop, bool has_offset0 = true); + +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/intel/jit/codegen/kernel.hpp b/src/gpu/intel/jit/codegen/kernel.hpp index 150918d9a9c..9a206c11554 100644 --- a/src/gpu/intel/jit/codegen/kernel.hpp +++ b/src/gpu/intel/jit/codegen/kernel.hpp @@ -28,6 +28,8 @@ #include "gpu/intel/compute/utils.hpp" #include "gpu/intel/jit/codegen/operand.hpp" #include "gpu/intel/jit/codegen/register_allocator.hpp" +#include "gpu/intel/jit/codegen/register_scope.hpp" +#include "gpu/intel/jit/codegen/reorder.hpp" #include "gpu/intel/jit/emulation.hpp" #include "gpu/intel/jit/ir/ir.hpp" #include "gpu/intel/jit/ir/ir_builder.hpp" @@ -591,20 +593,28 @@ class ir_kernel_t : public jit_generator { } void eadd3(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, - const ngen_operand_t &src0, const ngen_operand_t &src1, - const ngen_operand_t &src2) { + const ngen_operand_t &_src0, const ngen_operand_t &_src1, + const ngen_operand_t &_src2) { + auto src0 = _src0; + auto src1 = _src1; + auto src2 = _src2; + auto scope = ngen_register_scope_t(ra_); + align_src_dst_offset(this, scope, mod, dst, src0); + align_src_dst_offset(this, scope, mod, dst, src1); if (hw >= ngen::HW::XeHP) { if (src2.is_reg_data()) { - add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.reg_data()); + align_src_dst_offset(this, scope, mod, dst, src2); + add3(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.reg_data()); } else { - add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.immediate()); + add3(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } return; } add(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); if (src2.is_reg_data()) { + align_src_dst_offset(this, scope, mod, dst, src2); add(mod, dst.reg_data(), dst.reg_data(), src2.reg_data()); } else { add(mod, dst.reg_data(), dst.reg_data(), src2.immediate()); @@ -612,26 +622,34 
@@ class ir_kernel_t : public jit_generator { } void emad(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, - const ngen_operand_t &src0, const ngen_operand_t &src1, - const ngen_operand_t &src2) { + const ngen_operand_t &_src0, const ngen_operand_t &_src1, + const ngen_operand_t &_src2) { + auto src0 = _src0; + auto src1 = _src1; + auto src2 = _src2; + auto scope = ngen_register_scope_t(ra_); + align_src_dst_offset(this, scope, mod, dst, src1); if (src2.is_reg_data()) { - mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.reg_data()); + align_src_dst_offset(this, scope, mod, dst, src0); + align_src_dst_offset(this, scope, mod, dst, src2); + mad(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.reg_data()); } else if (hw < ngen::HW::XeLP) { + align_src_dst_offset(this, scope, mod, dst, src0); mul(mod, dst.reg_data(), src1.reg_data(), src2.immediate()); add(mod, dst.reg_data(), dst.reg_data(), src0.reg_data()); } else if (src0.is_immediate() && (ngen_is_dw(src0.type()) || src0.type() == ngen::DataType::uw)) { // dword immediate src0 is not supported, move to a register. - auto tmp_src0 = ra_.alloc_sub(src0.type()); + auto tmp_src0 = scope.alloc_sub(src0.type()); mov(1, tmp_src0, src0.immediate()); - mad(mod, dst.reg_data(), tmp_src0, src1.reg_data(), - src2.immediate()); - ra_.safeRelease(tmp_src0); + mad(mod, dst.reg_data(), tmp_src0, + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } else { - mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), - src2.immediate()); + align_src_dst_offset(this, scope, mod, dst, src0); + mad(mod, dst.reg_data(), fixup_ternary_rgn(src0.reg_data()), + fixup_ternary_rgn(src1.reg_data()), src2.immediate()); } } @@ -1144,6 +1162,13 @@ class ir_kernel_t : public jit_generator { return ir_utils::safe_divide(local_size, exec_cfg_.simd()); } + static ngen::RegData fixup_ternary_rgn(const ngen::RegData &r) { + ngen::RegData retn = r; + return ((retn.getHS() == 1) && (retn.getVS() == retn.getWidth())) + ? retn.setRegion(1, 1, 0) + : retn; + } + kernel_iface_t kernel_iface_; std::string kernel_name_; exec_config_t exec_cfg_; diff --git a/src/gpu/intel/jit/codegen/reorder.hpp b/src/gpu/intel/jit/codegen/reorder.hpp index 12d2187c8d1..aa4bc370794 100644 --- a/src/gpu/intel/jit/codegen/reorder.hpp +++ b/src/gpu/intel/jit/codegen/reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1300,15 +1300,17 @@ void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope, int dst_off = dst.offset(); int src_byte_off = src.byte_offset(); int dst_byte_off = dst.byte_offset(); + int esize = mod.getExecSize(); + int grf_size = ngen::GRF::bytes(scope.hw()); + int grf_src = grf_size / src.hs(); + int grf_dst = grf_size / dst.hs(); // If src is aligned with dst, return. - if ((is_xf || is_bf_to_f) && src_off == dst_off) return; - if (!is_xf && src_byte_off == dst_byte_off) return; + if ((is_xf || is_bf_to_f) && src_off % grf_src == dst_off % grf_dst) return; + if (!is_xf && src_byte_off % grf_size == dst_byte_off % grf_size) return; int new_src_byte_off = (is_xf ? 
dst_off * src_type_size : dst_byte_off); - int esize = mod.getExecSize(); - int grf_size = ngen::GRF::bytes(scope.hw()); int src_size = std::max(src_type_size * esize * src_stride, src_type_size); auto new_src = scope.alloc_reg_buf_data( diff --git a/src/gpu/intel/jit/conv/config.cpp b/src/gpu/intel/jit/conv/config.cpp index 08bbb4e4166..fe36059d3e0 100644 --- a/src/gpu/intel/jit/conv/config.cpp +++ b/src/gpu/intel/jit/conv/config.cpp @@ -20,6 +20,7 @@ #include #include +#include "common/utils.hpp" #include "gpu/intel/jit/conv/grf_usage.hpp" #include "gpu/intel/jit/conv/message_patterns.hpp" #include "gpu/intel/jit/conv/normalization.hpp" @@ -659,6 +660,61 @@ void init_data_tags(const conv_config_t &cfg, const memory_desc_t &src_md, if (user_dst_req == "user") dst_tag = user_dst_tag = "user"; } +void prepare_zp_precompute_conv(const conv_problem_t &prb, dim_t *idhw, + dim_t *odhw, dim_t *pdhw, dim_t *ddhw) { + const bool is_bwd_d = (prb.prop_kind() == prop_kind::backward_data); + using memory_dims = std::vector; + memory_dims I {prb.id, prb.ih, prb.iw}; + memory_dims O {prb.od, prb.oh, prb.ow}; + memory_dims K {prb.kd, prb.kh, prb.kw}; + memory_dims S {prb.sd, prb.sh, prb.sw}; + memory_dims D {prb.dd, prb.dh, prb.dw}; + memory_dims P {prb.pd, prb.ph, prb.pw}; + const int off = 5 - prb.ndims; + const auto *w = prb.conv_pd->weights_md(); + + // restore the original layout of the prb values + const auto *s + = (is_bwd_d) ? prb.conv_pd->diff_dst_md() : prb.conv_pd->src_md(); + const auto *d + = (is_bwd_d) ? prb.conv_pd->diff_src_md() : prb.conv_pd->dst_md(); + auto has_dim = [&](int i) { + return (s->dims[2 + i] > 1) || (d->dims[2 + i] > 1) + || (w->dims[2 + i + prb.with_groups] > 1); + }; + auto move_back = [&](int i, int off) { + if (off == 0) return; + I[i - off] = O[i - off] = K[i - off] = S[i - off] = 1; + D[i - off] = P[i - off] = 0; + std::swap(I[i - off], I[i]); + std::swap(O[i - off], O[i]); + std::swap(K[i - off], K[i]); + std::swap(S[i - off], S[i]); + std::swap(D[i - off], D[i]); + std::swap(P[i - off], P[i]); + }; + bool has_d = (off <= 0) && has_dim(0 - off); + bool has_h = (off <= 1) && has_dim(1 - off); + bool has_w = (off <= 2) && has_dim(2 - off); + if (!has_d && !has_h && !has_w) has_w = true; + move_back(1, has_d * (!has_h == has_w)); + move_back(2, !has_w * (!has_h + 1)); + + for (int i = off; i < int(K.size()); i++) { + const auto KD = (K[i] - 1) * (D[i] + 1) + 1; + ir_assert(w->dims[2 + i + prb.with_groups - off] == K[i]); + O[i] = ir_utils::max_unique_pad_states( + O[i], I[i], KD, P[i], S[i], true); + I[i] = std::min(KD, I[i]); + } + for (int i = 0; i < 3; i++) { + idhw[i] = (i < off) ? 0 : I[i]; + odhw[i] = (i < off) ? 0 : O[i]; + pdhw[i] = (i < off) ? 0 : P[i]; + ddhw[i] = (i < off) ? 
0 : D[i]; + } +} + status_t init_tensor_layouts( conv_config_t &cfg, convolution_pd_t *pd, impl::engine_t *engine) { const auto &prb = cfg.prb(); @@ -778,6 +834,27 @@ status_t init_tensor_layouts( bia.set_compute(bia_layout); bia.set_user(user_bia_layout); + if (cfg.zp_cfg().needs_src_reorder_precalc) { + auto get_channels = [](const layout_t &layout) { + const dim_t min_esize = 16; + return std::max(utils::rnd_up_pow2(layout.dim(1) * layout.dim(2)), + min_esize); + }; + using namespace memory_extra_flags; + prepare_zp_precompute_conv(prb, wei_md.extra.idhw, wei_md.extra.odhw, + wei_md.extra.pdhw, wei_md.extra.ddhw); + + wei_md.extra.dst_size = sizeof(float); + for (const auto &o : wei_md.extra.odhw) + wei_md.extra.dst_size *= std::max(o, dim_t(1)); + if (prb.prop_kind() == prop_kind::backward_data) { + wei_md.extra.flags |= compensation_gpu_conv_asymmetric_src_bwd; + wei_md.extra.dst_size *= get_channels(src_layout); + } else { + wei_md.extra.dst_size *= get_channels(dst_layout); + } + wei_md.extra.flags |= compensation_gpu_conv_asymmetric_src; + } return status::success; } diff --git a/src/gpu/intel/jit/conv/config.hpp b/src/gpu/intel/jit/conv/config.hpp index f698d7ab546..b20e7240889 100644 --- a/src/gpu/intel/jit/conv/config.hpp +++ b/src/gpu/intel/jit/conv/config.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -675,6 +675,8 @@ int default_regs(const conv_config_t &cfg); void init_kernel_grid(conv_config_t &cfg); void init_walk_order(conv_config_t &cfg); void init_thread_group_grid(conv_config_t &cfg); +void prepare_zp_precompute_conv(const conv_problem_t &prb, dim_t *idhw, + dim_t *odhw, dim_t *pdhw, dim_t *ddhw); std::array get_kernel_grid_conv_dims(const conv_config_t &cfg); std::array get_thread_group_grid_conv_dims( const conv_config_t &cfg); diff --git a/src/gpu/intel/jit/conv/gen_convolution.cpp b/src/gpu/intel/jit/conv/gen_convolution.cpp index 3719f00f371..a292f27c98c 100644 --- a/src/gpu/intel/jit/conv/gen_convolution.cpp +++ b/src/gpu/intel/jit/conv/gen_convolution.cpp @@ -25,6 +25,7 @@ #include "common/impl_registration.hpp" #include "common/utils.hpp" #include "common/verbose.hpp" +#include "gpu/gpu_zero_points_conv.hpp" #include "gpu/intel/jit/ir/kernel_info.hpp" #include "gpu/intel/jit/reorder/reorder_kernel.hpp" #include "gpu/intel/jit/utils/utils.hpp" @@ -45,8 +46,7 @@ struct conv_pd_data_t { conv_config_t pd_cfg; tensor_config_t tensor_cfg; std::vector kernel_infos; - std::shared_ptr zp_pd; - std::shared_ptr zp_prim; + std::shared_ptr zp_pd; }; class gen_convolution_t { @@ -72,79 +72,31 @@ class gen_convolution_t { CHECK(init_pd_time_cfg( prb, pd->data->pd_cfg, engine, pd, &pd->attr_)); - if (pd->data->pd_cfg.zp_cfg().needs_src_precalc) { - memory::dims I {prb.id, prb.ih, prb.iw}; - memory::dims O {prb.od, prb.oh, prb.ow}; - memory::dims K {prb.kd, prb.kh, prb.kw}; - memory::dims S {prb.sd, prb.sh, prb.sw}; - memory::dims D {prb.dd, prb.dh, prb.dw}; - memory::dims P {prb.pd, prb.ph, prb.pw}; - const int off = 5 - prb.ndims; - const auto *w = pd->invariant_wei_md(); - { // restore the original layout of the prb values - const auto *s = pd->invariant_src_md(); - const auto *d = pd->invariant_dst_md(); - auto has_dim = [&](int i) { - return (s->dims[2 + i] > 1) || (d->dims[2 + i] > 1) - || (w->dims[2 + 
i + prb.with_groups] > 1); - }; - auto move_back = [&](int i, int off) { - if (off == 0) return; - I[i - off] = O[i - off] = K[i - off] = S[i - off] = 1; - D[i - off] = P[i - off] = 0; - std::swap(I[i - off], I[i]); - std::swap(O[i - off], O[i]); - std::swap(K[i - off], K[i]); - std::swap(S[i - off], S[i]); - std::swap(D[i - off], D[i]); - std::swap(P[i - off], P[i]); - }; - bool has_d = (off <= 0) && has_dim(0 - off); - bool has_h = (off <= 1) && has_dim(1 - off); - bool has_w = (off <= 2) && has_dim(2 - off); - if (!has_d && !has_h && !has_w) has_w = true; - move_back(1, has_d * (!has_h == has_w)); - move_back(2, !has_w * (!has_h + 1)); + if (pd->data->pd_cfg.zp_cfg().needs_src_reorder_precalc + || pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + primitive_attr_t attr; + if (pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + int mask = 0; + CHECK(pd->attr_.zero_points_.get(DNNL_ARG_SRC, &mask)); + attr.zero_points_.set(DNNL_ARG_SRC, mask); + attr.post_ops_.append_eltwise( + 1.f, alg_kind::eltwise_linear, -1.f, 0.f); } - memory::dims S1 {1, 1, 1}; - memory::dims P1 {0, 0, 0}; - memory::dims dims_src {1, dim_t(prb.g) * prb.ic}; - memory::dims dims_dst {1, dim_t(prb.g) * prb.oc}; - - for (int i = off; i < int(K.size()); i++) { - const auto KD = (K[i] - 1) * (D[i] + 1) + 1; - dims_src.emplace_back(std::min(KD, I[i])); - dims_dst.emplace_back(ir_utils::max_unique_pad_states( - O[i], I[i], KD, P[i], S[i], true)); - P1[i] = dims_dst.back() - dims_src.back() - 1 + KD - P[i]; + dim_t I[3], O[3], P[3], D[3]; + prepare_zp_precompute_conv(prb, I, O, P, D); + CHECK(create_zp_precompute_conv_pd(pd->data->zp_pd, engine, + attr, pd->weights_md(), I, O, P, D, data_type::f32, + pd->get_prop_kind(), + !pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc)); + if (pd->data->pd_cfg.zp_cfg().needs_src_conv_precalc) { + auto scratchpad = pd->scratchpad_registry().registrar(); + scratchpad.book(memory_tracking::names::key_nested_multiple, + pd->data->zp_pd->scratchpad_registry()); } - memory::desc src(dims_src, memory::data_type::s8, - memory::format_tag::any); - memory::desc dst(dims_dst, memory::data_type::s32, - memory::format_tag::any); - - // create a nested conv and allocate a nested scratchpad for it - primitive_attr_t attr; - int mask = 0; - CHECK(pd->attr_.zero_points_.get(DNNL_ARG_SRC, &mask)); - attr.zero_points_.set(DNNL_ARG_SRC, mask); - attr.post_ops_.append_eltwise( - 1.f, alg_kind_t::dnnl_eltwise_linear, -1.f, 0.f); - dnnl_primitive_desc *zp_pd; - CHECK(dnnl_convolution_forward_primitive_desc_create(&zp_pd, - engine, dnnl_prop_kind_t::dnnl_forward_inference, - dnnl_alg_kind_t::dnnl_convolution_direct, src.get(), w, - nullptr, dst.get(), S1.data() + off, D.data() + off, - P.data() + off, P1.data() + off, &attr)); - pd->data->zp_pd.reset(zp_pd, dnnl_primitive_desc_destroy); - auto scratchpad = pd->scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_nested_multiple, - pd->data->zp_pd->impl()->scratchpad_registry()); } - pd->data->tensor_cfg = get_tensor_config(pd->data->pd_cfg, - (pd->data->zp_pd) ? 
pd->data->zp_pd->impl()->src_md() - : nullptr); + pd->data->tensor_cfg = get_tensor_config( + pd->data->pd_cfg, zp_conv_md_in(*pd->data)); pd->data->kernel_infos.reserve(max_kernels); CHECK(init_kernel_infos(pd)); @@ -176,7 +128,7 @@ class gen_convolution_t { int max_tries = 100; conv_config_t cfg; layout_t zp_dst; - if (data.zp_pd) zp_dst = layout_t(data.zp_pd->impl()->dst_md(), false); + if (data.zp_pd) zp_dst = layout_t(zp_conv_md_out(data), false); if (primitive->cache_blob()) { tiler->set_cur_version(primitive->version()); @@ -198,8 +150,17 @@ class gen_convolution_t { ir_info() << cfg; init_nd_ranges(primitive, cfg); - auto &kernel_infos = data.kernel_infos; + + // This absolutely HAS to be executed first if present, + // since it adds its own version mark to the cache blob + for (int i = 0; i < int(kernel_infos.size()); i++) + if (kernel_infos[i].id() == kernel_id_t::zp_precalc) { + ir_assert(data.zp_pd); + CHECK(primitive->create_nested_primitive( + zp_prim_, data.zp_pd, engine)); + } + std::vector tmp_kernels; for (int i = 0; i < int(kernel_infos.size()); i++) { auto &info = kernel_infos[i]; @@ -248,10 +209,6 @@ class gen_convolution_t { break; case kernel_id_t::zp_precalc: - ir_assert(data.zp_pd); - if (!data.zp_prim) - CHECK(data.zp_pd->impl()->create_primitive( - data.zp_prim, engine)); tmp_kernels.emplace_back(); continue; @@ -319,12 +276,11 @@ class gen_convolution_t { new memory_t(ctx.stream()->engine(), md, std::move(s))); }; - ir_assert(data.zp_prim); + ir_assert(zp_prim_); std::unique_ptr zp_src, zp_dst; - CHECK(scratchpad_arg(zp_src, "src_zero_points", - data.zp_pd->impl()->src_md())); CHECK(scratchpad_arg( - zp_dst, "dst", data.zp_pd->impl()->dst_md())); + zp_src, "src_zero_points", zp_conv_md_in(data))); + CHECK(scratchpad_arg(zp_dst, "dst", zp_conv_md_out(data))); exec_args_t e_args; auto src_zp_idx = DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC; @@ -334,9 +290,9 @@ class gen_convolution_t { e_args[DNNL_ARG_DST] = memory_arg_t {zp_dst.get(), false}; exec_ctx_t e_ctx(ctx, std::move(e_args)); const auto nm = memory_tracking::names::key_nested_multiple; - nested_scratchpad_t ns(ctx, nm, data.zp_prim); + nested_scratchpad_t ns(ctx, nm, zp_prim_); e_ctx.set_scratchpad_grantor(ns.grantor()); - CHECK(data.zp_prim->execute(e_ctx)); + CHECK(zp_prim_->execute(e_ctx)); } nsubmitted++; if (nsubmitted == nkernels) break; @@ -347,6 +303,20 @@ class gen_convolution_t { } private: + static const memory_desc_t *zp_conv_md_in(const conv_pd_data_t &data) { + if (!data.zp_pd) return nullptr; + const bool is_bwd_d + = (data.zp_pd->get_prop_kind() == prop_kind::backward_data); + return (is_bwd_d) ? data.zp_pd->diff_dst_md() : data.zp_pd->src_md(); + } + + static const memory_desc_t *zp_conv_md_out(const conv_pd_data_t &data) { + if (!data.zp_pd) return nullptr; + const bool is_bwd_d + = (data.zp_pd->get_prop_kind() == prop_kind::backward_data); + return (is_bwd_d) ? data.zp_pd->diff_src_md() : data.zp_pd->dst_md(); + } + template static kernel_info_t &create_kernel_info(T *pd, kernel_id_t kernel_id) { auto &infos = pd->data->kernel_infos; @@ -361,10 +331,8 @@ class gen_convolution_t { static status_t init_kernel_infos(T *pd) { auto &data = *pd->data; auto &cfg = data.pd_cfg; - const bool needs_zp_precalc = cfg.zp_cfg().needs_src_precalc; - auto &conv_info = create_kernel_info(pd, kernel_id_t::convolution); - auto &zp_precalc_info = (needs_zp_precalc) + auto &zp_precalc_info = (cfg.zp_cfg().needs_src_conv_precalc) ? 
create_kernel_info(pd, kernel_id_t::zp_precalc) : conv_info; @@ -374,8 +342,10 @@ class gen_convolution_t { // Initialize kernel arguments. int scratchpad_key = memory_tracking::names::key_none; for (auto &t : data.tensor_cfg.tensors()) { - const bool src_zp_precalc - = needs_zp_precalc && (t.name == "src_zero_points"); + const bool wei_reorder_precalc = (t.name == "wei") + && cfg.zp_cfg().needs_src_reorder_precalc; + const bool src_conv_precalc = (t.name == "src_zero_points") + && cfg.zp_cfg().needs_src_conv_precalc; const auto compute_buf = make_buffer(t.name); size_t compute_size = t.compute_layout.size(); @@ -390,7 +360,7 @@ class gen_convolution_t { auto add_compute_arg = [&](kernel_info_t &ki, const expr_t &buf, bool is_input) { - if (t.needs_reorder || src_zp_precalc) + if (t.needs_reorder || src_conv_precalc) ki.register_scratchpad_arg( buf, compute_arg_key, is_input, compute_size); else @@ -411,12 +381,12 @@ class gen_convolution_t { return zero_out_info; }; - if (t.needs_reorder || src_zp_precalc) { + if (t.needs_reorder || src_conv_precalc) { int user_arg_key = compute_arg_key; auto user_buf = make_buffer(t.name + "_user"); compute_arg_key = ++scratchpad_key; - if (!src_zp_precalc && t.is_input) { + if (!src_conv_precalc && t.is_input) { auto &reorder_info = create_kernel_info(pd, kernel_id_t::pre_reorder); reorder_info.register_user_arg(user_buf, user_arg_key, @@ -425,7 +395,7 @@ class gen_convolution_t { reorder_info.set_nd_range(reorder_kernel_t<>::nd_range( cfg.exec_cfg(), t.user_layout, t.compute_layout)); } - if (!src_zp_precalc && t.is_output) { + if (!src_conv_precalc && t.is_output) { auto &reorder_info = create_kernel_info(pd, kernel_id_t::post_reorder); add_compute_arg(reorder_info, compute_buf, true); @@ -434,7 +404,7 @@ class gen_convolution_t { reorder_info.set_nd_range(reorder_kernel_t<>::nd_range( cfg.exec_cfg(), t.compute_layout, t.user_layout)); } - if (src_zp_precalc) { + if (src_conv_precalc) { scratchpad_book(++scratchpad_key); create_zero_out_info().register_scratchpad_arg(compute_buf, scratchpad_key, /*is_input=*/false, compute_size); @@ -456,6 +426,12 @@ class gen_convolution_t { add_compute_arg(zp_precalc_info, make_buffer("dst"), false); } scratchpad_book(compute_arg_key); + if (wei_reorder_precalc) { + // user-supplied weights contain precomputed ZP values, so + // the buffer is to be passed to the conv alongside weights + conv_info.register_user_arg( + user_buf, user_arg_key, t.is_input && !t.is_output); + } } if (t.needs_zero_out) { add_compute_arg(create_zero_out_info(), compute_buf, false); @@ -512,6 +488,7 @@ class gen_convolution_t { std::vector kernels_; std::vector nd_ranges_; + std::shared_ptr zp_prim_; }; status_t gen_convolution_fwd_t::pd_t::init(impl::engine_t *engine) { diff --git a/src/gpu/intel/jit/conv/normalization.cpp b/src/gpu/intel/jit/conv/normalization.cpp index 71dabfe229d..e1914e3eba3 100644 --- a/src/gpu/intel/jit/conv/normalization.cpp +++ b/src/gpu/intel/jit/conv/normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -175,9 +175,11 @@ void maybe_reshape_dims(dim_idx_t ndims, layout_t &layout, // this method only gets called when ZP precompute is in order; // in all other cases ZPs are applied ad-hoc, without a post-op view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { - auto map_o2k = [](view_t &v, dim_idx_t idx, dim_t O, dim_t I, dim_t KD, - dim_t P, dim_t S) { - const bool needs_right_bound = ((O - 1) * S + (KD - P) >= I); + auto map_o2k = [this](view_t &v, dim_idx_t idx, dim_t O, dim_t I, dim_t K, + dim_t D, dim_t P, dim_t S) { + const auto KD = (K - 1) * (D + 1) + 1; + const auto KDP = (KD > 1) ? KD - P : 0; + const bool needs_right_bound = (O - 1) * S + KDP >= I; expr_t o = v.vvars()[idx]; if (KD >= I) { o = o * S; @@ -186,7 +188,13 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { dim_t off = P; if (P > 0) l = binary_op_t::make(op_kind_t::_min, o * S - P, 0); if (needs_right_bound) { - r = binary_op_t::make(op_kind_t::_max, o * S + (KD - P), I); + if (schedule_.var_bound(o) > O) { + auto q = binary_op_t::make( + op_kind_t::_min, o * S + KDP, (O - 1) * S + KDP); + r = binary_op_t::make(op_kind_t::_max, q, I); + } else { + r = binary_op_t::make(op_kind_t::_max, o * S + KDP, I); + } off -= I; } o = (!l.is_empty()) ? l : o; @@ -218,9 +226,6 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { } dst = layout_t(dst.type(), dst.ndims(), dst.offset(), new_blk, false); - const auto KDD = (prb_.kd - 1) * (prb_.dd + 1) + 1; - const auto KDH = (prb_.kh - 1) * (prb_.dh + 1) + 1; - const auto KDW = (prb_.kw - 1) * (prb_.dw + 1) + 1; view_t view(vars, 6); view.set_vdim(vars[0], 1); // mb view.set_vdim(vars[1], prb_.g); @@ -228,9 +233,9 @@ view_t conv_post_op_view_mapper_t::create_src_zp_view(uint32_t mask) const { view.set_tdim(0, vars[0]); view.set_tdim(1, vars[1]); view.set_tdim(2, vars[2]); - map_o2k(view, 3, prb_.od, prb_.id, KDD, prb_.pd, prb_.sd); - map_o2k(view, 4, prb_.oh, prb_.ih, KDH, prb_.ph, prb_.sh); - map_o2k(view, 5, prb_.ow, prb_.iw, KDW, prb_.pw, prb_.sw); + map_o2k(view, 3, prb_.od, prb_.id, prb_.kd, prb_.dd, prb_.pd, prb_.sd); + map_o2k(view, 4, prb_.oh, prb_.ih, prb_.kh, prb_.dh, prb_.ph, prb_.sh); + map_o2k(view, 5, prb_.ow, prb_.iw, prb_.kw, prb_.dw, prb_.pw, prb_.sw); view.set_tlayout(dst); return view; } diff --git a/src/gpu/intel/jit/conv/normalization.hpp b/src/gpu/intel/jit/conv/normalization.hpp index 7e49a4c4a2c..cf926487376 100644 --- a/src/gpu/intel/jit/conv/normalization.hpp +++ b/src/gpu/intel/jit/conv/normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
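The reworked map_o2k() above folds the dilated-kernel math into the helper itself. Below is a standalone sketch (plain C++, not oneDNN code) of that math, using the same variable names as the diff: K kernel size, D dilation, P left padding, S stride, O output size, I input size.

#include <cstdint>
#include <iostream>

int main() {
    const std::int64_t K = 3, D = 1, P = 1, S = 2, O = 8, I = 16;

    // Effective extent of a dilated kernel: (K - 1) * (D + 1) + 1.
    // With K = 3, D = 1 every other tap is skipped, so the window spans 5 inputs.
    const std::int64_t KD = (K - 1) * (D + 1) + 1;
    const std::int64_t KDP = (KD > 1) ? KD - P : 0;

    // The window of the last output o = O - 1 covers inputs [o*S - P, o*S - P + KD - 1],
    // so it touches the right padding whenever (O - 1) * S + (KD - P) >= I,
    // which is the needs_right_bound condition above.
    const bool needs_right_bound = (O - 1) * S + KDP >= I; // 18 >= 16 -> true here

    std::cout << "KD=" << KD
              << " needs_right_bound=" << needs_right_bound << "\n";
    return 0;
}

The extra min() branch added by the patch covers schedules whose loop bound for o is rounded up past O: it clamps the upper-bound expression to the value it takes at the last real output, o = O - 1, before the max() against I.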
@@ -32,7 +32,8 @@ class conv_post_op_view_mapper_t : public post_op_view_mapper_t { const conv_problem_t &prb, const zero_points_config_t &zp_cfg, const layout_t &zp_dst) : post_op_view_mapper_t(schedule.c_view()) - , has_external_src_zps_(zp_cfg.needs_src_precalc) + , has_external_src_zps_(zp_cfg.needs_src_conv_precalc + || zp_cfg.needs_src_reorder_precalc) , schedule_(schedule) , prb_(prb) , zp_dst_(zp_dst) {} diff --git a/src/gpu/intel/jit/conv/zp_plan.cpp b/src/gpu/intel/jit/conv/zp_plan.cpp index f65eefcb28f..13f7aae303a 100644 --- a/src/gpu/intel/jit/conv/zp_plan.cpp +++ b/src/gpu/intel/jit/conv/zp_plan.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1446,7 +1446,6 @@ class zp_comp_apply_plan_t : public base_plan_t { struct zp_plan_impl_t : public base_plan_t { bool src_2d_loads = false; - bool needs_precalc = false; bool has_dpasw = false; split_dispatcher_t sd; send_plan_t load; @@ -1526,8 +1525,9 @@ void zp_plan_t::init(const conv_config_t &cfg, bool src_2d_loads, const layout_t &wei_layout, const layout_t &dst_layout) { impl->src_2d_loads = src_2d_loads; impl->has_dpasw = cfg.fma_kind() == fma_kind_t::dpasw; - impl->needs_precalc = cfg.zp_cfg().needs_src_precalc; - bool do_src = cfg.zp_cfg().do_src_compensation && !impl->needs_precalc; + bool do_src = cfg.zp_cfg().do_src_compensation + && !cfg.zp_cfg().needs_src_reorder_precalc + && !cfg.zp_cfg().needs_src_conv_precalc; bool do_wei = cfg.zp_cfg().do_wei_compensation; send_plan_t impl_load; @@ -1574,10 +1574,6 @@ bool zp_plan_t::has_zp_wei() const { return impl->has_zp_wei(); } -bool zp_plan_t::needs_precalc() const { - return impl->needs_precalc; -} - int zp_plan_t::load_reg_buf_size() const { return impl->load.reg_buf_size(); } diff --git a/src/gpu/intel/jit/conv/zp_plan.hpp b/src/gpu/intel/jit/conv/zp_plan.hpp index 14ec03419fd..267de03422f 100644 --- a/src/gpu/intel/jit/conv/zp_plan.hpp +++ b/src/gpu/intel/jit/conv/zp_plan.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
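A small sketch (the helper name and enum below are mine, not oneDNN API) of how the two new flags partition source zero-point handling: the two precalc modes are mutually exclusive by construction, and in-kernel compensation in zp_plan_t is generated only when neither applies, which is exactly the reworked do_src condition above.

#include <iostream>

enum class src_zp_mode { none, in_kernel, reorder_precalc, conv_precalc };

src_zp_mode select_src_zp_mode(bool do_src_compensation,
        bool needs_src_reorder_precalc, bool needs_src_conv_precalc) {
    if (!do_src_compensation) return src_zp_mode::none;
    if (needs_src_reorder_precalc) return src_zp_mode::reorder_precalc;
    if (needs_src_conv_precalc) return src_zp_mode::conv_precalc;
    // Only in this last case does zp_plan_t emit in-kernel compensation code.
    return src_zp_mode::in_kernel;
}

int main() {
    auto m = select_src_zp_mode(true, false, true);
    std::cout << static_cast<int>(m) << "\n"; // 3 == conv_precalc
    return 0;
}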
@@ -46,7 +46,6 @@ struct zp_plan_t : public base_plan_t { bool is_src_precomp_compatible() const; bool has_zp_src() const; bool has_zp_wei() const; - bool needs_precalc() const; int load_reg_buf_size() const; int mask_reg_buf_size() const; int comp_reg_buf_size() const; diff --git a/src/gpu/intel/jit/ir/epilogue.cpp b/src/gpu/intel/jit/ir/epilogue.cpp index d0f910e586b..315ce632dda 100644 --- a/src/gpu/intel/jit/ir/epilogue.cpp +++ b/src/gpu/intel/jit/ir/epilogue.cpp @@ -278,6 +278,9 @@ class post_op_tensor_t { stmt_t build_prefetch_stmt(const view_t &c_view) const { ir_assert(needs_load()); + // Disable prefetching for precomputed ZPs stored at the end of 'wei' + if ((mem_buf().str() == "wei") || (mem_buf().str() == "wei_user")) + return stmt_t(); auto prefetch = make_access_builder(*ir_ctx_, mem_view(), mem_buf(), expr_t(), send_op_t::prefetch, send_address_t::a64, get_cache_hint(c_view)); diff --git a/src/gpu/intel/jit/ir/kernel_info.hpp b/src/gpu/intel/jit/ir/kernel_info.hpp index 86390264760..56700b72bf9 100644 --- a/src/gpu/intel/jit/ir/kernel_info.hpp +++ b/src/gpu/intel/jit/ir/kernel_info.hpp @@ -144,11 +144,11 @@ class kernel_info_t { // Returns stage ID, kernels with smaller stage IDs are executed first. int stage_id() const { switch (id()) { - case kernel_id_t::pre_reorder: return 0; case kernel_id_t::zero_out: return 0; case kernel_id_t::zp_precalc: return 1; - case kernel_id_t::convolution: return 2; - case kernel_id_t::post_reorder: return 3; + case kernel_id_t::pre_reorder: return 2; + case kernel_id_t::convolution: return 3; + case kernel_id_t::post_reorder: return 4; default: ir_error_not_expected(); } return -1; diff --git a/src/gpu/intel/jit/ir/post_ops.cpp b/src/gpu/intel/jit/ir/post_ops.cpp index e10294ef5d1..a0dd20f7ad8 100644 --- a/src/gpu/intel/jit/ir/post_ops.cpp +++ b/src/gpu/intel/jit/ir/post_ops.cpp @@ -108,12 +108,27 @@ post_op_context_t::post_op_context_t(const primitive_attr_t &attr, if (po_vm_.can_use_simple_src_zps() && zp_cfg.do_src_compensation) { if (zp_cfg.is_runtime_src_zero_points) { - bool per_oc = !zp_cfg.is_common_src_zero_point - || zp_cfg.needs_src_precalc; - auto view = po_vm_.create_src_zp_view((per_oc) ? 1 << 1 : 0); + auto view = po_vm_.create_src_zp_view( + (!zp_cfg.is_common_src_zero_point) ? 
1 << 1 : 0); auto buf = kernel_info.find_arg("src_zero_points"); - auto in = add_input_tensor(view, buf); - post_ops_.emplace_back(c, c - in); + if (zp_cfg.needs_src_reorder_precalc) { + auto wei = kernel_info.find_arg("wei_user", true); + if (wei.is_empty()) wei = kernel_info.find_arg("wei"); + + layout_t tlayout(view.tlayout()); + tlayout.set_offset( + utils::div_up(schedule.b_view().tlayout().size(), + tlayout.type().size())); + view.set_tlayout(tlayout); + layout_t scalar(zp_cfg.src_zp_type, 0, + std::vector(view.vvars().size(), 1), false); + auto zp = add_input_tensor(view_t(scalar, view.vvars()), buf); + auto in = add_input_tensor(view, wei); + post_ops_.emplace_back(c, c - in * zp); + } else { + auto in = add_input_tensor(view, buf); + post_ops_.emplace_back(c, c - in); + } } else { auto func = eltwise_t::make(alg_kind::eltwise_linear, /*scale=*/1.f, diff --git a/src/gpu/intel/jit/ir/post_ops.hpp b/src/gpu/intel/jit/ir/post_ops.hpp index 84de8f0939d..72431dfc359 100644 --- a/src/gpu/intel/jit/ir/post_ops.hpp +++ b/src/gpu/intel/jit/ir/post_ops.hpp @@ -46,7 +46,8 @@ struct zero_points_config_t { bool is_common_src_zero_point = false; bool is_common_wei_zero_point = false; bool is_common_dst_zero_point = false; - bool needs_src_precalc = false; + bool needs_src_reorder_precalc = false; + bool needs_src_conv_precalc = false; int common_src_zero_point = 0; int common_wei_zero_point = 0; int common_dst_zero_point = 0; @@ -75,8 +76,10 @@ struct zero_points_config_t { pd && pd->attr()->zero_points_.common(DNNL_ARG_WEIGHTS)) , is_common_dst_zero_point( pd && pd->attr()->zero_points_.common(DNNL_ARG_DST)) - , needs_src_precalc( - pd && do_src_compensation && is_src_precalc_compatible(pd)) + , needs_src_reorder_precalc( + pd && do_src_compensation && can_use_src_reorder_precalc(pd)) + , needs_src_conv_precalc(pd && do_src_compensation + && !needs_src_reorder_precalc && can_use_src_conv_precalc(pd)) , common_src_zero_point(0) , common_wei_zero_point(0) , common_dst_zero_point(0) { @@ -102,12 +105,22 @@ struct zero_points_config_t { } private: - bool is_src_precalc_compatible(const primitive_desc_t *pd) { + bool can_use_src_reorder_precalc(const primitive_desc_t *pd) { if (pd->kind() != primitive_kind_t::dnnl_convolution) return false; - // In general, precomputed ZPs are slower than the regular ZPs up to a - // point where a nested convolution that does the precalc takes less - // time than the in-situ compensations; that usually happens around - // MB = 64, but the exact number is just a heuristic. + // Reorder-based precomputed ZPs are only available if the user did not + // specify the weights mem desc so the convolution can choose it freely + // and set a mem desc flag asking a reorder to precompute the values. + return (pd->invariant_wei_md()->format_kind == format_kind::any) + && pd->attr()->zero_points_.common(DNNL_ARG_SRC) + && pd->attr()->zero_points_.has_default_values( + DNNL_ARG_WEIGHTS); + } + bool can_use_src_conv_precalc(const primitive_desc_t *pd) { + if (pd->kind() != primitive_kind_t::dnnl_convolution) return false; + // In general, conv-based precomputed ZPs are slower than the regular + // ZPs up to a point where a nested convolution that does the precalc + // takes less time than the in-situ compensations; that usually happens + // around MB = 64, but the exact number is just a heuristic. 
// TODO: a finer-grained estimate return (pd->invariant_src_md()->dims[0] >= 64) && pd->attr()->zero_points_.has_default_values( diff --git a/src/gpu/intel/jit/ir/tensor_config.cpp b/src/gpu/intel/jit/ir/tensor_config.cpp index 20b8765df2b..a7c5e4f7c8d 100644 --- a/src/gpu/intel/jit/ir/tensor_config.cpp +++ b/src/gpu/intel/jit/ir/tensor_config.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,14 +38,14 @@ void init_extra_tensors(const zero_points_config_t &zp_cfg, /*is_input=*/true, /*is_output=*/false, zp_layout); }; if (zp_cfg.do_src_compensation && zp_cfg.is_runtime_src_zero_points) { - if (!zp_cfg.needs_src_precalc) { - add_zp_buffer("src_zero_points", zp_cfg.src_zp_type, DNNL_ARG_SRC, - (zp_cfg.is_common_src_zero_point) ? 1 : ic); - } else { + if (zp_cfg.needs_src_conv_precalc) { ir_assert(zp_src); int arg_key = DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC; tensor_cfg.add_tensor("src_zero_points", arg_key, /*is_input=*/true, /*is_output=*/false, layout_t(zp_src, false), layout_t()); + } else { + add_zp_buffer("src_zero_points", zp_cfg.src_zp_type, DNNL_ARG_SRC, + (zp_cfg.is_common_src_zero_point) ? 1 : ic); } } if (zp_cfg.do_wei_compensation && zp_cfg.is_runtime_wei_zero_points) { diff --git a/src/gpu/intel/jit/reorder/gen_reorder.cpp b/src/gpu/intel/jit/reorder/gen_reorder.cpp index 5f048447146..974b35210a8 100644 --- a/src/gpu/intel/jit/reorder/gen_reorder.cpp +++ b/src/gpu/intel/jit/reorder/gen_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
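Condensed sketch (standalone C++, parameter names are mine) of the two eligibility checks in zero_points_config_t above; the conv-based path acts as a fallback for cases where the reorder-based one is unavailable.

#include <cstdint>
#include <iostream>

// "wei_is_any" stands for invariant_wei_md()->format_kind == format_kind::any,
// "mb" for invariant_src_md()->dims[0]; 64 is the heuristic threshold quoted
// in the comment above, not a hard limit.
bool can_use_src_reorder_precalc(bool is_conv, bool wei_is_any,
        bool common_src_zp, bool default_wei_zp) {
    // A reorder can only embed the compensation if the convolution is free to
    // choose the weights layout and request the extra buffer via a md flag.
    return is_conv && wei_is_any && common_src_zp && default_wei_zp;
}

bool can_use_src_conv_precalc(bool is_conv, std::int64_t mb) {
    // The nested precalc convolution only pays off for large minibatches;
    // below the threshold, in-situ compensation in the main kernel is faster.
    return is_conv && mb >= 64;
}

int main() {
    std::cout << can_use_src_reorder_precalc(true, true, true, true) << " "
              << can_use_src_conv_precalc(true, 128) << "\n"; // prints: 1 1
    return 0;
}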
@@ -98,7 +98,7 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, | sm::rounding_mode; VDISPATCH_REORDER( attr()->has_default_values(skip_mask), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_REORDER(extra_ok(), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); + VDISPATCH_REORDER(extra_ok(true), VERBOSE_UNSUPPORTED_MD_FLAG, "extra_ok"); VDISPATCH_REORDER(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_REORDER(scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); @@ -148,6 +148,7 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, check_layout(dst_layout), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); VDISPATCH_REORDER(compute_engine->mayiuse_ngen_kernels(), VERBOSE_UNSUPPORTED_DEVICE_FEATURE, "ngen_kernels"); + auto *gpu_attr = utils::downcast(attr()->gpu_attr_.get()); hw_t hw(engine); @@ -158,7 +159,8 @@ status_t gen_reorder_t::pd_t::init(impl::engine_t *engine, cfg->set_zp_cfg(zp_cfg); VDISPATCH_REORDER_SC( init_kernel_info(), "kernel initialization unsuccessful"); - + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); return status::success; } @@ -202,6 +204,9 @@ status_t gen_reorder_t::pd_t::init_kernel_info() { } status_t gen_reorder_t::init(impl::engine_t *engine) { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + auto &cfg = *pd()->cfg; auto &info = *pd()->kernel_info; @@ -221,6 +226,7 @@ status_t gen_reorder_t::execute(const exec_ctx_t &ctx) const { info.set_args(arg_list, storage_list); CHECK(parallel_for(ctx, info.nd_range(), kernel_, arg_list)); + CHECK(pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_)); return status::success; } diff --git a/src/gpu/intel/jit/reorder/gen_reorder.hpp b/src/gpu/intel/jit/reorder/gen_reorder.hpp index c6aa048dfb3..478d5e030a4 100644 --- a/src/gpu/intel/jit/reorder/gen_reorder.hpp +++ b/src/gpu/intel/jit/reorder/gen_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ class gen_reorder_t : public gpu_primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } compute::kernel_t kernel_; + std::shared_ptr zp_precomp_conv_; }; } // namespace jit diff --git a/src/gpu/intel/ocl/ref_reorder.cpp b/src/gpu/intel/ocl/ref_reorder.cpp index e058b7091e4..edebcb3d305 100644 --- a/src/gpu/intel/ocl/ref_reorder.cpp +++ b/src/gpu/intel/ocl/ref_reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
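The gen_reorder hooks above follow a three-step "maybe_*" pattern. The sketch below (placeholder types; the function names come from the patch but the signatures are simplified) shows that every step degenerates to a no-op when the destination memory descriptor does not request precomputed ZP compensation, so plain reorders keep their existing path.

#include <cassert>
#include <memory>

struct zp_conv_pd_t {};   // stands in for the nested zp precompute conv pd
struct zp_conv_t {};      // stands in for the nested zp precompute conv primitive

struct reorder_pd_t {
    bool dst_needs_zp_precalc = false;          // derived from dst md extra flags
    std::shared_ptr<zp_conv_pd_t> zp_conv_pd;

    void maybe_create_zp_precompute_conv_pd() { // called from pd_t::init()
        if (dst_needs_zp_precalc) zp_conv_pd = std::make_shared<zp_conv_pd_t>();
    }
    void maybe_create_zp_precompute_conv(std::shared_ptr<zp_conv_t> &prim) const {
        if (zp_conv_pd) prim = std::make_shared<zp_conv_t>(); // called from init()
    }
    void maybe_exec_zp_precompute_conv(const std::shared_ptr<zp_conv_t> &prim) const {
        if (prim) { /* run the nested convolution after the reorder kernel */ }
    }
};

int main() {
    reorder_pd_t pd; // plain reorder: all three calls are no-ops
    pd.maybe_create_zp_precompute_conv_pd();
    std::shared_ptr<zp_conv_t> prim;
    pd.maybe_create_zp_precompute_conv(prim);
    assert(!prim);
    pd.maybe_exec_zp_precompute_conv(prim);
    return 0;
}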
@@ -150,17 +150,19 @@ status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const { CHECK(large_parallel_for( ctx, nd_range, kernels_[0], arg_list, arg_list.nargs())); - if (!conf.subbyte_pack) return status::success; - - compute::kernel_arg_list_t repack_arg_list; - repack_arg_list.set(0, *tmp); - repack_arg_list.set(1, dst); - repack_arg_list.set(2, into(conf.nelems)); - repack_arg_list.set(3, 4); - compute::range_t repack_gws((conf.nelems * 4 + 7) / 8); - compute::nd_range_t repack_nd_range(repack_gws); - return large_parallel_for( - ctx, repack_nd_range, kernels_[1], repack_arg_list, 4); + if (conf.subbyte_pack) { + compute::kernel_arg_list_t repack_arg_list; + repack_arg_list.set(0, *tmp); + repack_arg_list.set(1, dst); + repack_arg_list.set(2, into(conf.nelems)); + repack_arg_list.set(3, 4); + compute::range_t repack_gws((conf.nelems * 4 + 7) / 8); + compute::nd_range_t repack_nd_range(repack_gws); + CHECK(large_parallel_for( + ctx, repack_nd_range, kernels_[1], repack_arg_list, 4)); + } + CHECK(pd()->maybe_exec_zp_precompute_conv(ctx, zp_precomp_conv_)); + return status::success; } } // namespace ocl diff --git a/src/gpu/intel/ocl/ref_reorder.hpp b/src/gpu/intel/ocl/ref_reorder.hpp index 7b312fad53c..7feed0402f7 100644 --- a/src/gpu/intel/ocl/ref_reorder.hpp +++ b/src/gpu/intel/ocl/ref_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,8 +121,10 @@ struct ref_reorder_t : public gpu_primitive_t { VERBOSE_UNSUPPORTED_DT_CFG); VDISPATCH_REORDER_SC(init_conf(engine), "init_conf()"); - init_scratchpad(); + VDISPATCH_REORDER_SC(maybe_create_zp_precompute_conv_pd(dst_engine), + "failed to create nested zp precompute convolution"); + init_scratchpad(); return status::success; } @@ -137,6 +139,9 @@ struct ref_reorder_t : public gpu_primitive_t { }; status_t init(impl::engine_t *engine) override { + CHECK(pd()->maybe_create_zp_precompute_conv( + zp_precomp_conv_, engine, this)); + compute::kernel_ctx_t kernel_ctx; auto status = pd()->init_kernel_ctx(kernel_ctx); @@ -161,6 +166,7 @@ struct ref_reorder_t : public gpu_primitive_t { private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::vector kernels_; + std::shared_ptr zp_precomp_conv_; }; } // namespace ocl
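The ref_reorder_t::execute() rework above replaces an early return with an if-block for the same reason as in gen_reorder: the zero-point precompute convolution must run after every reorder variant. A minimal standalone sketch of that control-flow change (the CHECK macro and status values below are stand-ins, not the oneDNN definitions):

#include <iostream>

enum status_t { success = 0, runtime_error = 1 };
#define CHECK(f) \
    do { \
        status_t s_ = (f); \
        if (s_ != success) return s_; \
    } while (0)

status_t run_repack() { return success; }
status_t run_zp_precompute_conv() { return success; }

status_t execute(bool subbyte_pack) {
    // ... main reorder kernel would be launched here ...
    if (subbyte_pack) CHECK(run_repack()); // optional repack, no early return
    CHECK(run_zp_precompute_conv());       // common tail step, never skipped
    return success;
}

int main() {
    std::cout << (execute(true) == success) << "\n"; // prints: 1
    return 0;
}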