gpu: sycl: sum: implemented

t4c1 committed Jul 11, 2024
1 parent 0802ccf commit ef84374
Showing 9 changed files with 594 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/common/dnnl_traits.hpp
@@ -174,6 +174,7 @@ PKIND_TRAITS_INST(binary);
PKIND_TRAITS_INST(matmul);
PKIND_TRAITS_INST(resampling);
PKIND_TRAITS_INST(reduction);
PKIND_TRAITS_INST(sum);
PKIND_TRAITS_INST(sdpa);
#undef PKIND_TRAITS_INST

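For context, PKIND_TRAITS_INST instantiates the pkind_traits template that maps a primitive_kind enum value to compile-time metadata; the added line is what makes primitive_kind::sum resolvable by the rest of the codebase. Below is a hypothetical sketch of what such a trait instantiation could expand to; the real macro body in dnnl_traits.hpp is not shown in this diff and may differ:

// Hypothetical illustration only; the actual PKIND_TRAITS_INST body in
// src/common/dnnl_traits.hpp is not part of this diff and may differ.
enum class primitive_kind { binary, matmul, resampling, reduction, sum, sdpa };

template <primitive_kind kind>
struct pkind_traits {};

#define PKIND_TRAITS_INST(op) \
    template <> \
    struct pkind_traits<primitive_kind::op> { \
        static constexpr const char *str = #op; \
    };

PKIND_TRAITS_INST(sum) // maps primitive_kind::sum to its metadata
#undef PKIND_TRAITS_INST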
114 changes: 114 additions & 0 deletions src/gpu/generic/sycl/ref_sum.cpp
@@ -0,0 +1,114 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_sum.hpp"
#include "gpu/generic/sycl/sum_kernels.hpp"
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

status_t ref_sum_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

for (auto i = 0; i < conf_.n; ++i) {
conf_.src_md[i] = xpu::sycl::md_t(src_md(i));
conf_.src_scales[i] = scales()[i];
}
conf_.dst_md = xpu::sycl::md_t(dst_md());

// XXX: should probably be tuned.
conf_.block_size = 16;
conf_.wg_size = 32;
conf_.wk_size = memory_desc_wrapper(dst_md()).nelems();
return status::success;
}

status_t ref_sum_t::init(engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<sum_kernel_vec_t>();
CHECK(create_kernel(engine, kid, &kernel_));

return status::success;
}

status_t ref_sum_t::execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;

parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
auto src0_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 0);
auto src1_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 1);
auto src2_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 2);
auto src3_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 3);
auto src4_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 4);
auto src5_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 5);
auto src6_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 6);
auto src7_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 7);
auto src8_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 8);
auto src9_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 9);
auto src10_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 10);
auto src11_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 11);
auto src12_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 12);
auto src13_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 13);
auto src14_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 14);
auto src15_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 15);

auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);

sum_kernel_vec_t sum_kernel(pd()->conf_, src0_mem_arg, src1_mem_arg,
src2_mem_arg, src3_mem_arg, src4_mem_arg, src5_mem_arg,
src6_mem_arg, src7_mem_arg, src8_mem_arg, src9_mem_arg,
src10_mem_arg, src11_mem_arg, src12_mem_arg, src13_mem_arg,
src14_mem_arg, src15_mem_arg, dst_mem_arg);

const int block_size = pd()->conf_.block_size;
const int wg_size = pd()->conf_.wg_size;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), sum_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl
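A note on the launch geometry above: each work-group of wg_size items covers wg_size * block_size destination elements, and wg_cnt work-groups are launched to cover all wk_size elements, so the global range is wg_cnt * wg_size. A minimal standalone sketch of the same arithmetic, using the hard-coded tuning values from init_conf() and an assumed element count of 1000 (the count is an example, not taken from the commit):

#include <cstdio>

int main() {
    const int block_size = 16; // elements handled per work-item, from init_conf()
    const int wg_size = 32; // work-items per work-group, from init_conf()
    const int t_work = 1000; // dst element count; example value only

    const int wg_work = wg_size * block_size; // 512 elements per work-group
    const int wg_cnt = (t_work + wg_work - 1) / wg_work; // utils::div_up -> 2

    // Mirrors ::sycl::nd_range<1>(wg_cnt * wg_size, wg_size): 64 global items.
    std::printf("global=%d local=%d\n", wg_cnt * wg_size, wg_size);
    return 0;
}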
97 changes: 97 additions & 0 deletions src/gpu/generic/sycl/ref_sum.hpp
@@ -0,0 +1,97 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_GENERIC_SYCL_REF_SUM_HPP
#define GPU_GENERIC_SYCL_REF_SUM_HPP

#include "common/primitive.hpp"
#include "common/stream.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
#include "gpu/generic/sycl/sycl_io_helper.hpp"
#include "gpu/generic/sycl/sycl_post_ops.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "gpu/generic/sycl/sycl_q10n.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

struct ref_sum_t : public gpu::generic::sycl::primitive_t {
using gpu::generic::sycl::primitive_t::primitive_t;

struct pd_t : public gpu_sum_pd_t {
using gpu_sum_pd_t::gpu_sum_pd_t;

DECLARE_SUM_PD_T("dpcpp:ref:any", ref_sum_t);

status_t init(impl::engine_t *engine) {
using namespace data_type;
using namespace format_tag;

const memory_desc_wrapper dst_d(dst_md());
if (!utils::one_of(dst_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions cannot exceed MAX_NDIMS (6)
if (!dst_d.is_plain() || dst_d.ndims() > MAX_NDIMS)
return status::unimplemented;

const int n = n_inputs();
for (auto i = 0; i < n; ++i) {
const memory_desc_wrapper src_d(src_md(i));
if (!utils::one_of(src_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions cannot exceed MAX_NDIMS (6)
if (!src_d.is_plain() || src_d.ndims() > MAX_NDIMS)
return status::unimplemented;
}

const bool ok = set_default_params() == status::success
&& n <= MAX_NUM_TENSORS;
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_sum_conf_t conf_;

private:
status_t init_conf();

inline bool equal(float in_value, float in_compare_to) {
return std::fabs(in_value - in_compare_to) <= FLT_EPSILON;
}
};

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
kernel_t kernel_;
};

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
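For orientation, here is how user code reaches an implementation like ref_sum_t ("dpcpp:ref:any"): the public oneDNN C++ API builds a sum primitive descriptor over the source descriptors and per-source scales, and the library dispatches to a registered implementation for the engine. A minimal usage sketch, assuming a SYCL-capable GPU device and a oneDNN v3.x build; error handling is omitted:

#include <vector>
#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // assumes a GPU device is available
    stream strm(eng);

    memory::desc md({8, 16}, memory::data_type::f32, memory::format_tag::ab);
    memory src0(md, eng), src1(md, eng), dst(md, eng);

    // dst = 1.0f * src0 + 0.5f * src1; the scales correspond to the
    // per-source src_scales copied in pd_t::init_conf().
    sum::primitive_desc sum_pd(eng, md, {1.0f, 0.5f}, {md, md});
    sum(sum_pd).execute(strm,
            {{DNNL_ARG_MULTIPLE_SRC + 0, src0},
                    {DNNL_ARG_MULTIPLE_SRC + 1, src1}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}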
79 changes: 79 additions & 0 deletions src/gpu/generic/sycl/ref_sum_many_inputs.cpp
@@ -0,0 +1,79 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_sum_many_inputs.hpp"
#include "common/primitive_desc_iface.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

status_t ref_sum_many_inputs_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

return status::success;
}

status_t ref_sum_many_inputs_t::init(engine_t *engine) {
const size_t n = pd()->base_pds_.size();
base_prims_.resize(n);
for (size_t i = 0; i < n; ++i) {
CHECK(pd()->base_pds_[i]->impl()->create_primitive(
base_prims_[i], engine, cache_blob()));
}

return status::success;
}

status_t ref_sum_many_inputs_t::execute(const exec_ctx_t &ctx) const {
memory_arg_t dst_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, false};
memory_arg_t dst_read_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, true};

int n_remaining = pd()->conf_.n;
int in_arg_offset = 0;
int i = 0;

while (n_remaining > 0) {
bool pass_in_dst = i > 0;
int max_n_child_inputs = MAX_NUM_TENSORS - pass_in_dst;
int args_handled = std::min(n_remaining, max_n_child_inputs);
exec_args_t r_args;
r_args[DNNL_ARG_DST] = dst_mem_arg;
if (pass_in_dst) {
r_args[DNNL_ARG_MULTIPLE_SRC + 0] = dst_read_mem_arg;
}
for (int j = 0; j < args_handled; j++) {
r_args[DNNL_ARG_MULTIPLE_SRC + j + pass_in_dst]
= ctx.args().at(DNNL_ARG_MULTIPLE_SRC + j + in_arg_offset);
}
n_remaining -= args_handled;
in_arg_offset += args_handled;
exec_ctx_t r_ctx(ctx, std::move(r_args));
// Execute the child primitive for this chunk, then advance to the next.
CHECK(base_prims_[i]->execute(r_ctx));
i++;
}
return status::success;
}

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl
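The loop above chains child sum primitives when there are more sources than one kernel can bind: the first pass takes up to MAX_NUM_TENSORS sources, and every later pass spends one input slot re-reading the partially accumulated dst, leaving MAX_NUM_TENSORS - 1 slots for fresh sources. A small standalone sketch of the resulting pass count, assuming MAX_NUM_TENSORS is 16 (matching the sixteen DNNL_ARG_MULTIPLE_SRC slots bound in ref_sum.cpp):

#include <cassert>

// Pass count of ref_sum_many_inputs_t::execute for n_inputs sources.
// max_tensors = 16 is an assumption based on the sixteen src slots in
// ref_sum.cpp; the actual MAX_NUM_TENSORS value is not shown in this diff.
int num_passes(int n_inputs, int max_tensors = 16) {
    int passes = 0;
    int remaining = n_inputs;
    while (remaining > 0) {
        const bool pass_in_dst = passes > 0; // dst is re-read after pass 0
        const int capacity = max_tensors - pass_in_dst;
        remaining -= remaining < capacity ? remaining : capacity;
        ++passes;
    }
    return passes;
}

int main() {
    assert(num_passes(16) == 1); // fits in a single kernel launch
    assert(num_passes(17) == 2); // 16 sources, then dst + 1 source
    assert(num_passes(46) == 3); // 16, then dst + 15, then dst + 15
    return 0;
}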