@@ -3,6 +3,7 @@
 #include <array>
 #include <atomic>
 #include <sstream>
+#include <vector>

 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -197,6 +198,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );

+std::string mul_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \
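The templated kernel above is an element-wise multiply with a modulo broadcast: work item i writes x[i] * y[i % ky], so a y row of length ky repeats across each row of x. A host-side sketch of the same semantics, using illustrative names that are not part of the patch:

    // reference semantics of one mul_f32 launch over `global` work items
    for (int i = 0; i < global; i++) {
        dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i % ky];
    }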
@@ -239,6 +252,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };

+std::array<std::string, 2> mul_str_keys = {
+    "KERNEL_NAME", "TYPE"
+};
+std::array<std::string, 2> mul_str_values = {
+    "mul_f32", "float"
+};
+
 std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +281,13 @@ std::string generate_kernels() {
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
     }
+    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+        std::string mul_kernel = mul_template;
+        for (size_t j = 0; j < mul_str_keys.size(); j++) {
+            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+        }
+        src << mul_kernel << '\n';
+    }
     return src.str();
 }

@@ -272,6 +299,7 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel mul_f32_cl;
 static bool fp16_support;

 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -508,6 +536,9 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
     CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
     CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
 }

 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -644,6 +675,98 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     return err;
 }

+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_CL);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_mul_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
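For context, a plausible call site for the new entry point might look like the sketch below; it assumes src1 has already been moved to the device (ggml_cl_transform_tensor, touched in a later hunk, is one way that happens) and is not part of this diff:

    // hypothetical dispatch: both operands are f32, src1 lives on the device
    if (src0->type == GGML_TYPE_F32 && src1->backend == GGML_BACKEND_CL) {
        ggml_cl_mul(src0, src1, dst); // dst = src0 * src1, src1 broadcast along rows
    }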
@@ -860,44 +983,48 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
     GGML_ASSERT(to_fp32_cl != nullptr);

+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
-
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
             } else if (src0->backend == GGML_BACKEND_CL) {
                 d_Q = (cl_mem) src0->data;
             } else {
                 GGML_ASSERT(false);
             }
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                 const size_t local = CL_DMMV_BLOCK_SIZE;
                 const cl_int ncols = ne00;
+                events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+                events.emplace_back();
+
                 // wait for conversion
                 CL_CHECK(clFinish(queue));

@@ -910,7 +1037,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                                        d_Y, 0, ne10,
                                                        beta,
                                                        d_D, 0, ne01,
-                                                       &queue, &ev_sgemm);
+                                                       &queue, events.data() + ev_idx++);

             if (status != clblast::StatusCode::kSuccess) {
                 GGML_ASSERT(false);
@@ -919,8 +1046,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-            clReleaseEvent(ev_sgemm);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
         }
     }

@@ -1026,3 +1158,33 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     tensor->data = dst;
     tensor->backend = GGML_BACKEND_CL;
 }
+
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    cl_int err;
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    cl_mem dst;
+    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // seek must succeed on both platforms
1178+
1179+ size_t ret2 = fread (buf_host, size, 1 , fp);
1180+ if (ret2 != 1 ) {
1181+ fprintf (stderr, " unexpectedly reached end of file" );
1182+ exit (1 );
1183+ }
1184+
1185+ clEnqueueWriteBuffer (queue, dst, CL_TRUE, 0 , size, buf_host, 0 , nullptr , nullptr );
1186+
1187+ tensor->data = dst;
1188+ free (buf_host);
1189+ fclose (fp);
1190+ }
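ggml_cl_load_data streams a tensor's bytes from the model file straight into a freshly created device buffer, so the weights never need a long-lived host-side copy. A hypothetical caller (file name and offset are purely illustrative):

    // `data_offset` marks where this tensor's payload starts in the file
    ggml_cl_load_data("ggml-model-q4_0.bin", tensor, data_offset);
    // tensor->data now holds a cl_mem; the function does not update
    // tensor->backend, so the caller is expected to set it afterwards
    tensor->backend = GGML_BACKEND_CL;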