gpu: sycl: sum: implemented

t4c1 committed Jul 11, 2024
1 parent 0802ccf commit ef84374
Showing 9 changed files with 594 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/common/dnnl_traits.hpp
@@ -174,6 +174,7 @@ PKIND_TRAITS_INST(binary);
PKIND_TRAITS_INST(matmul);
PKIND_TRAITS_INST(resampling);
PKIND_TRAITS_INST(reduction);
PKIND_TRAITS_INST(sum);
PKIND_TRAITS_INST(sdpa);
#undef PKIND_TRAITS_INST

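For context, PKIND_TRAITS_INST instantiates the pkind_traits template that maps a primitive_kind enum value to compile-time metadata; the added line is what makes primitive_kind::sum resolvable by the rest of the codebase. Below is a hypothetical sketch of what such a trait instantiation could expand to; the real macro body in dnnl_traits.hpp is not shown in this diff and may differ:

// Hypothetical illustration only; the actual PKIND_TRAITS_INST body in
// src/common/dnnl_traits.hpp is not part of this diff and may differ.
enum class primitive_kind { binary, matmul, resampling, reduction, sum, sdpa };

template <primitive_kind kind>
struct pkind_traits {};

#define PKIND_TRAITS_INST(op) \
    template <> \
    struct pkind_traits<primitive_kind::op> { \
        static constexpr const char *str = #op; \
    };

PKIND_TRAITS_INST(sum) // maps primitive_kind::sum to its metadata
#undef PKIND_TRAITS_INST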
114 changes: 114 additions & 0 deletions src/gpu/generic/sycl/ref_sum.cpp
@@ -0,0 +1,114 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_sum.hpp"
#include "gpu/generic/sycl/sum_kernels.hpp"
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

status_t ref_sum_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

for (auto i = 0; i < conf_.n; ++i) {
conf_.src_md[i] = xpu::sycl::md_t(src_md(i));
conf_.src_scales[i] = scales()[i];
}
conf_.dst_md = xpu::sycl::md_t(dst_md());

// XXX: should probably be tuned.
conf_.block_size = 16;
conf_.wg_size = 32;
conf_.wk_size = memory_desc_wrapper(dst_md()).nelems();
return status::success;
}

status_t ref_sum_t::init(engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<sum_kernel_vec_t>();
CHECK(create_kernel(engine, kid, &kernel_));

return status::success;
}

status_t ref_sum_t::execute(const exec_ctx_t &ctx) const {
using namespace memory_tracking::names;

parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
auto src0_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 0);
auto src1_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 1);
auto src2_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 2);
auto src3_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 3);
auto src4_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 4);
auto src5_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 5);
auto src6_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 6);
auto src7_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 7);
auto src8_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 8);
auto src9_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 9);
auto src10_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 10);
auto src11_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 11);
auto src12_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 12);
auto src13_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 13);
auto src14_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 14);
auto src15_mem_arg
= CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_MULTIPLE_SRC + 15);

auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);

sum_kernel_vec_t sum_kernel(pd()->conf_, src0_mem_arg, src1_mem_arg,
src2_mem_arg, src3_mem_arg, src4_mem_arg, src5_mem_arg,
src6_mem_arg, src7_mem_arg, src8_mem_arg, src9_mem_arg,
src10_mem_arg, src11_mem_arg, src12_mem_arg, src13_mem_arg,
src14_mem_arg, src15_mem_arg, dst_mem_arg);

const int block_size = pd()->conf_.block_size;
const int wg_size = pd()->conf_.wg_size;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), sum_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl
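A note on the launch geometry above: each work-group of wg_size items covers wg_size * block_size destination elements, and wg_cnt work-groups are launched to cover all wk_size elements, so the global range is wg_cnt * wg_size. A minimal standalone sketch of the same arithmetic, using the hard-coded tuning values from init_conf() and an assumed element count of 1000 (the count is an example, not taken from the commit):

#include <cstdio>

int main() {
    const int block_size = 16; // elements handled per work-item, from init_conf()
    const int wg_size = 32; // work-items per work-group, from init_conf()
    const int t_work = 1000; // dst element count; example value only

    const int wg_work = wg_size * block_size; // 512 elements per work-group
    const int wg_cnt = (t_work + wg_work - 1) / wg_work; // utils::div_up -> 2

    // Mirrors ::sycl::nd_range<1>(wg_cnt * wg_size, wg_size): 64 global items.
    std::printf("global=%d local=%d\n", wg_cnt * wg_size, wg_size);
    return 0;
}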
97 changes: 97 additions & 0 deletions src/gpu/generic/sycl/ref_sum.hpp
@@ -0,0 +1,97 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_GENERIC_SYCL_REF_SUM_HPP
#define GPU_GENERIC_SYCL_REF_SUM_HPP

#include "common/primitive.hpp"
#include "common/stream.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
#include "gpu/generic/sycl/sycl_io_helper.hpp"
#include "gpu/generic/sycl/sycl_post_ops.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "gpu/generic/sycl/sycl_q10n.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

struct ref_sum_t : public gpu::generic::sycl::primitive_t {
using gpu::generic::sycl::primitive_t::primitive_t;

struct pd_t : public gpu_sum_pd_t {
using gpu_sum_pd_t::gpu_sum_pd_t;

DECLARE_SUM_PD_T("dpcpp:ref:any", ref_sum_t);

status_t init(impl::engine_t *engine) {
using namespace data_type;
using namespace format_tag;

const memory_desc_wrapper dst_d(dst_md());
if (!utils::one_of(dst_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions cannot exceed MAX_NDIMS (6)
if (!dst_d.is_plain() || dst_d.ndims() > MAX_NDIMS)
return status::unimplemented;

const int n = n_inputs();
for (auto i = 0; i < n; ++i) {
const memory_desc_wrapper src_d(src_md(i));
if (!utils::one_of(src_d.data_type(), f32, bf16, f16, s8, u8))
return status::unimplemented;
// Block formats are not yet supported
// Dimensions cannot exceed MAX_NDIMS (6)
if (!src_d.is_plain() || src_d.ndims() > MAX_NDIMS)
return status::unimplemented;
}

const bool ok = set_default_params() == status::success
&& n <= MAX_NUM_TENSORS;
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_sum_conf_t conf_;

private:
status_t init_conf();

inline bool equal(float in_value, float in_compare_to) {
return std::fabs(in_value - in_compare_to) <= FLT_EPSILON;
}
};

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
kernel_t kernel_;
};

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
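For orientation, here is how user code reaches an implementation like ref_sum_t ("dpcpp:ref:any"): the public oneDNN C++ API builds a sum primitive descriptor over the source descriptors and per-source scales, and the library dispatches to a registered implementation for the engine. A minimal usage sketch, assuming a SYCL-capable GPU device and a oneDNN v3.x build; error handling is omitted:

#include <vector>
#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // assumes a GPU device is available
    stream strm(eng);

    memory::desc md({8, 16}, memory::data_type::f32, memory::format_tag::ab);
    memory src0(md, eng), src1(md, eng), dst(md, eng);

    // dst = 1.0f * src0 + 0.5f * src1; the scales correspond to the
    // per-source src_scales copied in pd_t::init_conf().
    sum::primitive_desc sum_pd(eng, md, {1.0f, 0.5f}, {md, md});
    sum(sum_pd).execute(strm,
            {{DNNL_ARG_MULTIPLE_SRC + 0, src0},
                    {DNNL_ARG_MULTIPLE_SRC + 1, src1}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}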
79 changes: 79 additions & 0 deletions src/gpu/generic/sycl/ref_sum_many_inputs.cpp
@@ -0,0 +1,79 @@
/*******************************************************************************
* Copyright 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_sum_many_inputs.hpp"
#include "common/primitive_desc_iface.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

status_t ref_sum_many_inputs_t::pd_t::init_conf() {
conf_ = sycl_sum_conf_t();
conf_.n = n_inputs();

return status::success;
}

status_t ref_sum_many_inputs_t::init(engine_t *engine) {
const size_t n = pd()->base_pds_.size();
base_prims_.resize(n);
for (size_t i = 0; i < n; ++i) {
CHECK(pd()->base_pds_[i]->impl()->create_primitive(
base_prims_[i], engine, cache_blob()));
}

return status::success;
}

status_t ref_sum_many_inputs_t::execute(const exec_ctx_t &ctx) const {
memory_arg_t dst_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, false};
memory_arg_t dst_read_mem_arg = {ctx.args().at(DNNL_ARG_DST).mem, true};

int n_remaining = pd()->conf_.n;
int in_arg_offset = 0;
int i = 0;

while (n_remaining > 0) {
bool pass_in_dst = i > 0;
int max_n_child_inputs = MAX_NUM_TENSORS - pass_in_dst;
int args_handled = std::min(n_remaining, max_n_child_inputs);
exec_args_t r_args;
r_args[DNNL_ARG_DST] = dst_mem_arg;
if (pass_in_dst) {
r_args[DNNL_ARG_MULTIPLE_SRC + 0] = dst_read_mem_arg;
}
for (int j = 0; j < args_handled; j++) {
r_args[DNNL_ARG_MULTIPLE_SRC + j + pass_in_dst]
= ctx.args().at(DNNL_ARG_MULTIPLE_SRC + j + in_arg_offset);
}
n_remaining -= args_handled;
in_arg_offset += args_handled;
exec_ctx_t r_ctx(ctx, std::move(r_args));
// Execute the child primitive for this chunk, then advance to the next.
CHECK(base_prims_[i]->execute(r_ctx));
i++;
}
return status::success;
}

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl
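The loop above chains child sum primitives when there are more sources than one kernel can bind: the first pass takes up to MAX_NUM_TENSORS sources, and every later pass spends one input slot re-reading the partially accumulated dst, leaving MAX_NUM_TENSORS - 1 slots for fresh sources. A small standalone sketch of the resulting pass count, assuming MAX_NUM_TENSORS is 16 (matching the sixteen DNNL_ARG_MULTIPLE_SRC slots bound in ref_sum.cpp):

#include <cassert>

// Pass count of ref_sum_many_inputs_t::execute for n_inputs sources.
// max_tensors = 16 is an assumption based on the sixteen src slots in
// ref_sum.cpp; the actual MAX_NUM_TENSORS value is not shown in this diff.
int num_passes(int n_inputs, int max_tensors = 16) {
    int passes = 0;
    int remaining = n_inputs;
    while (remaining > 0) {
        const bool pass_in_dst = passes > 0; // dst is re-read after pass 0
        const int capacity = max_tensors - pass_in_dst;
        remaining -= remaining < capacity ? remaining : capacity;
        ++passes;
    }
    return passes;
}

int main() {
    assert(num_passes(16) == 1); // fits in a single kernel launch
    assert(num_passes(17) == 2); // 16 sources, then dst + 1 source
    assert(num_passes(46) == 3); // 16, then dst + 15, then dst + 15
    return 0;
}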