2 changes: 1 addition & 1 deletion docs/ops.md
@@ -19,7 +19,7 @@ Legend:
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
16,266 changes: 8,133 additions & 8,133 deletions docs/ops/SYCL.csv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/arange.cu
@@ -21,8 +21,8 @@ void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);

float start;
float stop;
float step;
float stop;
float step;
memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
2 changes: 1 addition & 1 deletion ggml/src/ggml-sycl/common.hpp
@@ -22,7 +22,7 @@
#include "ggml-sycl.h"
#include "presets.hpp"
#include "sycl_hw.hpp"

#include "ggml-sycl/dpct/helper.hpp"

#if GGML_SYCL_DNNL
#include "dnnl.hpp"
190 changes: 159 additions & 31 deletions ggml/src/ggml-sycl/element_wise.cpp

Large diffs are not rendered by default.
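The element_wise.cpp changes, which presumably carry the new SYCL GGML_OP_ARANGE kernel, are not rendered above. As a rough sketch of what such a kernel might look like, mirroring the CUDA ggml_cuda_op_arange shown earlier (the helper name arange_f32_sycl and the launch geometry are assumptions, not the actual patch):

```cpp
// Hypothetical sketch only: the actual element_wise.cpp diff is not shown here.
#include <cmath>
#include <cstring>

#include "common.hpp"        // ggml-sycl context and queue helpers
#include "element_wise.hpp"  // declares ggml_sycl_arange

static void arange_f32_sycl(float * dst, const int ne, const float start, const float step, sycl::queue * stream) {
    constexpr int block_size = 256;
    const int num_blocks = (ne + block_size - 1) / block_size;

    stream->parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks * block_size), sycl::range<1>(block_size)),
        [=](sycl::nd_item<1> item) {
            const int i = (int) item.get_global_id(0);
            if (i >= ne) {
                return;
            }
            // Same formula as the CUDA kernel: element i is start + step * i.
            dst[i] = start + step * (float) i;
        });
}

void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    // start/stop/step are packed as three floats in dst->op_params,
    // exactly as in the CUDA version above.
    float start;
    float stop;
    float step;
    memcpy(&start, (const float *) dst->op_params + 0, sizeof(float));
    memcpy(&stop,  (const float *) dst->op_params + 1, sizeof(float));
    memcpy(&step,  (const float *) dst->op_params + 2, sizeof(float));

    const int64_t ne = ggml_nelements(dst);
    GGML_ASSERT(ne == (int64_t) std::ceil((stop - start) / step));

    arange_f32_sycl((float *) dst->data, (int) ne, start, step, ctx.stream());
}
```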

3 changes: 3 additions & 0 deletions ggml/src/ggml-sycl/element_wise.hpp
@@ -3,6 +3,7 @@

#include "common.hpp"
#include "ggml.h"
#include "ggml-sycl/backend.hpp"
#include <limits> // For std::numeric_limits

template <typename T>
@@ -83,4 +84,6 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

#endif // GGML_SYCL_ELEMENTWISE_HPP
100 changes: 49 additions & 51 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -28,6 +28,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <regex>
#include "common.hpp"

#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
@@ -1746,27 +1747,25 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
const size_t shared_mem = ncols_pad * sizeof(int);

if (order == GGML_SORT_ORDER_ASC) {
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
sycl::range<1>(shared_mem), cgh);

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
x, dst, ncols, ncols_pad, item_ct1,
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
.get());
});
});
} else if (order == GGML_SORT_ORDER_DESC) {
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
sycl::range<1>(shared_mem), cgh);

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
x, dst, ncols, ncols_pad, item_ct1,
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
@@ -1784,50 +1783,47 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
const sycl::range<3> block_nums(1, nrows, 1);
const size_t shared_mem = 256 * sizeof(float);

stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<float, 1> shared_data(
sycl::range<1>(shared_mem/sizeof(float)), cgh);
sycl::local_accessor<int, 1> shared_indices(
sycl::range<1>(shared_mem/sizeof(float)), cgh);

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
const int tid = item_ct1.get_local_id(2);
const int row = item_ct1.get_global_id(1);

float max_val = -INFINITY;
int max_idx = -1;

for (int col = tid; col < ncols; col += 256) {
float val = x[row * ncols + col];
if (val > max_val) {
max_val = val;
max_idx = col;
}
}
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
const int tid = item_ct1.get_local_id(2);
const int row = item_ct1.get_global_id(1);

shared_data[tid] = max_val;
shared_indices[tid] = max_idx;
item_ct1.barrier(sycl::access::fence_space::local_space);
float max_val = -INFINITY;
int max_idx = -1;

for (int stride = 256/2; stride > 0; stride >>= 1) {
if (tid < stride) {
float val1 = shared_data[tid];
float val2 = shared_data[tid + stride];
if (val2 > val1) {
shared_data[tid] = val2;
shared_indices[tid] = shared_indices[tid + stride];
}
}
item_ct1.barrier(sycl::access::fence_space::local_space);
for (int col = tid; col < ncols; col += 256) {
float val = x[row * ncols + col];
if (val > max_val) {
max_val = val;
max_idx = col;
}
}

shared_data[tid] = max_val;
shared_indices[tid] = max_idx;
item_ct1.barrier(sycl::access::fence_space::local_space);

if (tid == 0) {
dst[row] = shared_indices[0];
for (int stride = 256 / 2; stride > 0; stride >>= 1) {
if (tid < stride) {
float val1 = shared_data[tid];
float val2 = shared_data[tid + stride];
if (val2 > val1) {
shared_data[tid] = val2;
shared_indices[tid] = shared_indices[tid + stride];
}
}
});
item_ct1.barrier(sycl::access::fence_space::local_space);
}

if (tid == 0) {
dst[row] = shared_indices[0];
}
});
});
}
static void diag_mask_inf_f32_sycl(const float *x, float *dst,
@@ -2900,7 +2896,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
void ** ptrs_dst_get = ptrs_dst.get();
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
});
@@ -3408,7 +3404,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
{
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 0> src1_row_acc(cgh);

char *__restrict src1_contiguous_get =
@@ -3420,9 +3416,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
size_t ids_nb_ct6 = ids->nb[1];
size_t ids_nb_ct7 = ids->nb[0];

cgh.parallel_for(
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_copy_src1_to_contiguous(
src1_original, src1_contiguous_get,
dev_cur_src1_row_get,
@@ -3453,15 +3448,14 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
{
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
sycl::range<3> grid_dims(1, 1, num_src1_rows);
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
const char *__restrict dst_contiguous_get =
dst_contiguous.get();
const mmid_row_mapping *__restrict dev_row_mapping_get =
dev_row_mapping.get();

cgh.parallel_for(
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_copy_dst_from_contiguous(dst_original,
dst_contiguous_get,
dev_row_mapping_get,
@@ -3676,6 +3670,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_PAD:
ggml_sycl_pad(ctx, dst);
break;
case GGML_OP_ARANGE:
ggml_sycl_arange(ctx, dst);
break;
case GGML_OP_LEAKY_RELU:
ggml_sycl_leaky_relu(ctx, dst);
break;
@@ -4070,7 +4067,6 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .event_record = */ ggml_backend_sycl_event_record,
/* .event_wait = */ ggml_backend_sycl_event_wait,
/* .optimize_graph = */ NULL,
};

static ggml_guid_t ggml_backend_sycl_guid() {
@@ -4164,6 +4160,8 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_

static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_ARANGE:
return true;
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_type src0_type = op->src[0]->type;
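A recurring pattern in the ggml-sycl.cpp hunks above is the replacement of raw stream->submit(...) and cgh.parallel_for(...) calls with the sycl_launch and sycl_parallel_for helpers. Their definitions are not part of this excerpt; as a rough mental model only (an assumption, the real helpers may do extra bookkeeping, for example for SYCL graph capture), they forward to the underlying SYCL calls:

```cpp
// Hypothetical sketch of the launch helpers as thin forwarding wrappers;
// treat this as an illustration of the call shape used above, not the
// backend's actual implementation.
#include <sycl/sycl.hpp>

#include <utility>

template <typename CommandGroupFn>
static void sycl_launch(sycl::queue * stream, CommandGroupFn && cgf) {
    // Equivalent of the old stream->submit(...) call sites.
    stream->submit(std::forward<CommandGroupFn>(cgf));
}

template <int Dims, typename KernelFn>
static void sycl_parallel_for(sycl::handler & cgh, const sycl::nd_range<Dims> & range, KernelFn && fn) {
    // Equivalent of the old cgh.parallel_for(range, ...) call sites.
    cgh.parallel_for(range, std::forward<KernelFn>(fn));
}
```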
20 changes: 5 additions & 15 deletions tests/test-backend-ops.cpp
@@ -6050,9 +6050,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2});
add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2});

// test case for k_bin_bcast_unravel in CUDA backend
add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1});

// stable diffusion
add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1});
add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1});
@@ -6394,7 +6391,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (int64_t ne1 : {16, 1024}) {
test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 1, 1}, scale, max_bias));
test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias));
test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 2, 3}, scale, max_bias));
}
}
}
@@ -6497,6 +6493,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_pad_reflect_1d());
test_cases.emplace_back(new test_roll());
test_cases.emplace_back(new test_arange());
test_cases.emplace_back(new test_arange(GGML_TYPE_F32, 0, 1, 1000));
test_cases.emplace_back(new test_arange(GGML_TYPE_F32, -5, 0.5, 20));
test_cases.emplace_back(new test_arange(GGML_TYPE_F64, 0, 1e-3, 100000));
test_cases.emplace_back(new test_arange(GGML_TYPE_F16, 5, 1, -1));
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu());

@@ -6810,17 +6810,7 @@ static void list_all_ops() {
static void show_test_coverage() {
std::set<std::string> all_ops;
for (int i = 1; i < GGML_OP_COUNT; i++) {
auto op = (enum ggml_op)i;
if (op == GGML_OP_VIEW ||
op == GGML_OP_RESHAPE ||
op == GGML_OP_PERMUTE ||
op == GGML_OP_TRANSPOSE ||
op == GGML_OP_CONT ||
op == GGML_OP_GLU ||
op == GGML_OP_UNARY) {
continue;
}
all_ops.insert(ggml_op_name(op));
all_ops.insert(ggml_op_name((enum ggml_op)i));
}
for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
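For context on the new test_arange cases above: GGML_OP_ARANGE fills a 1-D tensor with start + step * i for i in [0, ceil((stop - start) / step)). A minimal host-side usage sketch via the public ggml API (assuming the CPU backend helpers from ggml-cpu.h; not part of this PR):

```cpp
// Minimal usage sketch (assumed, not from the PR): build and run an arange
// graph on the CPU backend, then print the values.
#include <cstdio>

#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx (older trees declare it in ggml.h)

int main() {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 10 elements: 0, 1, ..., 9 (ceil((10 - 0) / 1) = 10).
    struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 10.0f, 1.0f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, t);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 1);

    const float * data = (const float *) t->data;
    for (int64_t i = 0; i < ggml_nelements(t); ++i) {
        printf("%g ", data[i]);
    }
    printf("\n");

    ggml_free(ctx);
    return 0;
}
```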