 #include <atomic>
 #include <sstream>
 #include <vector>
+#include <limits>

 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -604,21 +605,44 @@ struct cl_buffer { |
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;

+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); // smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; // largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size) {
-            cl_mem mem = b.mem;
-            *actual_size = b.size;
-            b.size = 0;
-            return mem;
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
+        }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
         }
     }
+    if (best_i != -1) // found the smallest unused buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if (worst_i != -1) // no unused buffer fits; free the largest one so a bigger allocation can take its slot
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
+    }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
@@ -692,9 +716,10 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, |
     size_t x_size;
     size_t d_size;

-    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
     cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+

     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -792,10 +817,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr |
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -868,10 +893,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr |
     if (src0->backend == GGML_BACKEND_CL) {
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -970,13 +995,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * |
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }

     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
982 | 1007 | cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); |
@@ -1143,7 +1168,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) { |
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {