Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 58 additions & 13 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
cl_kernel kernel_mul_mat_f32_f32;
cl_kernel kernel_mul_mat_f16_f16;
cl_kernel kernel_mul_mat_f16_f32_1row;
Expand Down Expand Up @@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
cl_kernel kernel_concat_f32;
cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
cl_kernel kernel_conv_2d_f16;
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
Expand Down Expand Up @@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
GGML_LOG_CONT(".");
}
Expand Down Expand Up @@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
Expand Down Expand Up @@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
nth *= 2;
}

size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
int nchunks = 1;
if (src0->type == GGML_TYPE_F32) {
const int chunk_target = nth * 4;
nchunks = (ne00 + chunk_target - 1) / chunk_target;
nchunks = MAX(1, MIN(nchunks, 64));
}

size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
size_t local_work_size[] = {(size_t)nth, 1, 1};

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
Expand Down Expand Up @@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

int nth = MIN(64, ne0);

cl_kernel kernel = backend_ctx->kernel_concat_f32;
const bool concat_pack = (dim == 0 && ne0 < 32);
cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
: backend_ctx->kernel_concat_f32;

CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
Expand All @@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));

size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};
if (concat_pack) {
// packed kernel needs the dst dims to unflatten its 1-D row index.
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));

const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
const int base = MIN(64, maxwg);
const int tpr = MIN(ne0, base); // threads per row
const int rpw = MAX(1, base / tpr); // rows per workgroup
const int lsz = tpr * rpw;
const int nrows = ne1*ne2*ne3;
const int nwg = (nrows + rpw - 1) / rpw;
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
size_t local_work_size[] = {(size_t)lsz, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
} else {
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}

static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
Expand Down Expand Up @@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 2;
ndst = 4;
ndst = 16;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
Expand Down Expand Up @@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_cpy_f32_f16;
break;
case GGML_TYPE_F32:
kernel = backend_ctx->kernel_cpy_f32_f32;
kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
: backend_ctx->kernel_cpy_f32_f32;
break;
default:
GGML_ASSERT(false && "not implemented");
Expand Down Expand Up @@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));

const int nth = MIN(64, ne00);
if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
const int base = MIN(64, maxwg);
const int tpr = MIN(ne00, base); // threads per row
const int rpw = MAX(1, base / tpr); // rows per workgroup
const int lsz = tpr * rpw; // <= base <= maxwg
const int nrows = ne01*ne02*ne03;
const int nwg = (nrows + rpw - 1) / rpw;

size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
size_t local_work_size[] = {(size_t)lsz, 1, 1};

backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
} else {
const int nth = MIN(64, ne00);

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};

backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
}
}

static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
Expand Down
67 changes: 67 additions & 0 deletions ggml/src/ggml-opencl/kernels/concat.cl
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
*y = *x;
}
}

// Concatenate two f32 tensors (src0, src1) into dst, packing several short
// rows into one work-group.
//
// The launch is 1-D. Each work-group is split into `rows_per_wg` groups of
// `lanes_per_row` work-items; every group handles one dst row (a fixed
// (i1, i2, i3)) and its lanes stride over the row's ne0 elements.
// The host is expected to pick a local size that is a multiple of
// min(ne0, local size) so that every work-item maps to a whole row slot.
//
//   src0/src1/dst + offset*  : byte base pointers and byte offsets
//   ne00..ne03, nb00..nb03   : src0 extents and byte strides
//   nb10..nb13               : src1 byte strides
//   ne0..ne3, nb0..nb3       : dst extents and byte strides
//   dim                      : concatenation axis (0..3)
kernel void kernel_concat_f32_pack(
        global const char * src0,
        ulong               offset0,
        global const char * src1,
        ulong               offset1,
        global char       * dst,
        ulong               offsetd,
        int                 ne00,
        int                 ne01,
        int                 ne02,
        int                 ne03,
        ulong               nb00,
        ulong               nb01,
        ulong               nb02,
        ulong               nb03,
        ulong               nb10,
        ulong               nb11,
        ulong               nb12,
        ulong               nb13,
        int                 ne0,
        ulong               nb0,
        ulong               nb1,
        ulong               nb2,
        ulong               nb3,
        int                 dim,
        int                 ne1,
        int                 ne2,
        int                 ne3
) {
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // Carve the work-group into rows_per_wg slots of lanes_per_row lanes.
    const int wg_size       = get_local_size(0);
    const int lanes_per_row = min(ne0, wg_size);
    const int rows_per_wg   = wg_size / lanes_per_row;
    const int local_id      = get_local_id(0);
    const int row_in_wg     = local_id / lanes_per_row;
    const int lane          = local_id % lanes_per_row;
    const int row           = get_group_id(0)*rows_per_wg + row_in_wg;

    // The last work-group may have more row slots than rows left.
    if (row >= ne1*ne2*ne3) {
        return;
    }

    // Unflatten the 1-D row index into dst coordinates (i1 fastest).
    const int i1   = row % ne1;
    const int rest = row / ne1;
    const int i2   = rest % ne2;
    const int i3   = rest / ne2;

    // src1 coordinates are the dst coordinates shifted back by src0's extent
    // along the concatenation axis.
    int src0_extent;
    if (dim == 0) {
        src0_extent = ne00;
    } else if (dim == 1) {
        src0_extent = ne01;
    } else if (dim == 2) {
        src0_extent = ne02;
    } else {
        src0_extent = ne03;
    }
    int shift[4] = {0, 0, 0, 0};
    shift[dim] = src0_extent;

    // Row-level half of the "inside src0" test; only i0 varies in the loop.
    const bool row_in_src0 = i1 < ne01 && i2 < ne02 && i3 < ne03;

    global char * dst_row = dst + i3*nb3 + i2*nb2 + i1*nb1;

    for (int i0 = lane; i0 < ne0; i0 += lanes_per_row) {
        global const char * from;
        if (row_in_src0 && i0 < ne00) {
            from = src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00;
        } else {
            from = src1 + (i3 - shift[3])*nb13 + (i2 - shift[2])*nb12
                        + (i1 - shift[1])*nb11 + (i0 - shift[0])*nb10;
        }

        *(global float *)(dst_row + i0*nb0) = *(global const float *)from;
    }
}
59 changes: 59 additions & 0 deletions ggml/src/ggml-opencl/kernels/cpy.cl
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
}
}

// Copy an f32 tensor to an f32 tensor, packing several short source rows
// into one work-group.
//
// The launch is 1-D. Each work-group is split into `rows_per_wg` groups of
// `lanes_per_row` work-items; every group copies one source row (a fixed
// (i01, i02, i03)) and its lanes stride over the row's ne00 elements.
// The host is expected to pick a local size that is a multiple of
// min(ne00, local size) so that every work-item maps to a whole row slot.
//
// The destination position of a row is found by taking the linear index of
// the row's first element (row * ne00) and unflattening it over the dst
// extents; the row's elements are then written consecutively from there.
//
//   src0/dst + offset0/offsetd : base pointers and byte offsets
//   ne00..ne03, nb00..nb03     : src extents and byte strides
//   ne0..ne3,   nb0..nb3       : dst extents and byte strides
kernel void kernel_cpy_f32_f32_pack(
        global float * src0,
        ulong          offset0,
        global float * dst,
        ulong          offsetd,
        int            ne00,
        int            ne01,
        int            ne02,
        int            ne03,
        ulong          nb00,
        ulong          nb01,
        ulong          nb02,
        ulong          nb03,
        int            ne0,
        int            ne1,
        int            ne2,
        int            ne3,
        ulong          nb0,
        ulong          nb1,
        ulong          nb2,
        ulong          nb3
) {
    global char * src_base = (global char *)src0 + offset0;
    global char * dst_base = (global char *)dst  + offsetd;

    // Carve the work-group into rows_per_wg slots of lanes_per_row lanes.
    const int wg_size       = get_local_size(0);
    const int lanes_per_row = min(ne00, wg_size);
    const int rows_per_wg   = wg_size / lanes_per_row;
    const int local_id      = get_local_id(0);
    const int row           = get_group_id(0)*rows_per_wg + local_id / lanes_per_row;
    const int lane          = local_id % lanes_per_row;

    // The last work-group may have more row slots than rows left.
    if (row >= ne01*ne02*ne03) {
        return;
    }

    // Unflatten the 1-D row index into source coordinates (i01 fastest).
    const int i01   = row % ne01;
    const int outer = row / ne01;
    const int i02   = outer % ne02;
    const int i03   = outer / ne02;

    // Linear index of this row's first element, unflattened over dst dims.
    // 64-bit arithmetic keeps the intermediate products from wrapping.
    const long first_elem = (long)row * ne00;
    const long dst_plane  = (long)ne1*ne0;
    const long dst_volume = (long)ne2*ne1*ne0;

    const int i3 = (int)(first_elem / dst_volume);
    long remaining = first_elem - (long)i3*dst_volume;
    const int i2 = (int)(remaining / dst_plane);
    remaining -= (long)i2*dst_plane;
    const int i1 = (int)(remaining / ne0);
    const int i0 = (int)(remaining - (long)i1*ne0);

    global float      * dst_row = (global float *)(dst_base + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
    global const char * src_row = src_base + i03*nb03 + i02*nb02 + i01*nb01;

    // Source elements are addressed through nb00; destination elements are
    // written back-to-back starting at dst_row.
    for (int i00 = lane; i00 < ne00; i00 += lanes_per_row) {
        dst_row[i00] = *(global const float *)(src_row + i00*nb00);
    }
}

kernel void kernel_cpy_i32_i32(
global int * src0,
ulong offset0,
Expand Down
24 changes: 15 additions & 9 deletions ggml/src/ggml-opencl/kernels/get_rows.cl
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
src1 = (global int*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);

int i10 = get_group_id(0);
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int nchunks = get_num_groups(0) / ne10;
int g = get_group_id(0);
int i10 = g / nchunks;
int chunk = g - i10 * nchunks;
int i11 = get_group_id(1);
int i12 = get_group_id(2);

int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];

int i02 = i11;
int i03 = i12;

for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
if (ind >= ne00) {
return;
}
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1);
global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);

int span = (ne00 + nchunks - 1) / nchunks;
int start = chunk * span;
int end = min(start + span, ne00);

for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
dst_row[ind] = src_row[ind];
}
}

Expand Down
Loading
Loading