From 08d3b401906d46ab22a9053f17eb7b3c02a1c964 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 14 Mar 2024 04:42:29 -0700 Subject: [PATCH 01/23] iq2_s --- ggml-sycl.cpp | 206 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 2 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 9f6506383cc0d..d19e290030f01 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -1,3 +1,4 @@ + // // MIT license // Copyright (C) 2024 Intel Corporation @@ -4732,6 +4733,36 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr } +template +static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2s_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + const int i = item_ct1.get_group(2); + const block_iq2_s * x = (const block_iq1_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + + /* DPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register @@ -7648,6 +7679,64 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, #endif } +static __dpct_inline__ float +vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint64_t *iq2s_grid, const uint64_t *ksigns64) { +#if QK_K == 256 + const block_iq2_s * bq2 = (const block_iq2_s *) vbq; + + const int ib32 = iqs; + const uint8_t * q8 = bq8_1[ib32].qs; + const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + (void) ksigns64; + assert(false); + return 0.f; +#endif +#else + (void) ksigns64; + assert(false); + return 0.f; +#endif +} + + + template @@ -8504,6 +8593,53 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * } } + +template +static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2s_grid_ptr, const uint64_t *ksigns64_ptr ) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs, iq2s_grid_ptr, ksigns64_ptr); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + template static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> &item_ct1) { @@ -10247,6 +10383,36 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, } } +template +static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + iq2s_grid.init(*stream); + ksigns_iq2xs.init(*stream); + kmask_iq2xs.init(*stream); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr(); + auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); + auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_s( + vx, y, item_ct1, iq2s_grid_ptr_ct1, + ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + }); + }); + } +} + + template static void convert_unary_sycl(const void *__restrict__ vx, dst_t *__restrict__ y, const int k, @@ -10301,6 +10467,8 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { return dequantize_row_iq3_s_sycl; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; case GGML_TYPE_F32: return convert_unary_sycl; default: @@ -10345,6 +10513,8 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { return dequantize_row_iq3_s_sycl; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; case GGML_TYPE_F16: return convert_unary_sycl; default: @@ -10990,6 +11160,35 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, } } +static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + iq2s_grid.init(*stream); + ksigns64.init(*stream); + + stream->submit([&](sycl::handler &cgh) { + auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr(); + auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq2_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1, + iq2s_grid_ptr_ct1, ksigns64_ptr_ct1); + }); + }); + } +} + + static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols_x, const int nrows_x, const int ncols_y, @@ -13738,6 +13937,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= VER_GEN9 ? 128 : 64; case GGML_TYPE_IQ3_S: @@ -13808,6 +14008,9 @@ inline void ggml_sycl_op_mul_mat_vec_q( case GGML_TYPE_IQ1_S: mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -17153,8 +17356,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || - a_type == GGML_TYPE_IQ4_XS) { + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS) { return false; } return true; From 9b030b98a64d5aa48b9fef406e6184f0a552be95 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 14 Mar 2024 04:47:03 -0700 Subject: [PATCH 02/23] iq2_s --- ggml-sycl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index d19e290030f01..00ce8e7fae419 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -1,4 +1,3 @@ - // // MIT license // Copyright (C) 2024 Intel Corporation From 81b6139f4c526d12f52962f042e8cb6d404d76bf Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 14 Mar 2024 04:57:58 -0700 Subject: [PATCH 03/23] bug fix --- ggml-sycl.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 00ce8e7fae419..fdef752bcbb33 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -7727,15 +7727,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, assert(false); return 0.f; #endif -#else - (void) ksigns64; - assert(false); - return 0.f; -#endif } - - template From 0af3ed733fa6b10a5be5c6d28f98dc01e740ab6e Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 14 Mar 2024 05:06:22 -0700 Subject: [PATCH 04/23] bug fix --- ggml-sycl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index fdef752bcbb33..31b3a3e15921b 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -4739,7 +4739,7 @@ static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restr const uint8_t *ksigns_iq2xs, const uint8_t *kmask_iq2xs) { const int i = item_ct1.get_group(2); - const block_iq2_s * x = (const block_iq1_s *) vx; + const block_iq2_s * x = (const block_iq2_s *) vx; const int tid = item_ct1.get_local_id(2); #if QK_K == 256 @@ -4747,8 +4747,8 @@ static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restr const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]); + const uint8_t * grid1 = (const uint8_t *)(iq2s_grid + qs[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq2s_grid + qs[2*il+1]); const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7]; for (int j = 0; j < 4; ++j) { @@ -7686,7 +7686,7 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, const block_iq2_s * bq2 = (const block_iq2_s *) vbq; const int ib32 = iqs; - const uint8_t * q8 = bq8_1[ib32].qs; + const int8_t * q8 = bq8_1[ib32].qs; const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; const uint8_t ls1 = bq2->scales[ib32] & 0xf; const uint8_t ls2 = bq2->scales[ib32] >> 4; From 87e5c86686e3c87ddf26829f863a4b5f9ed7ab83 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 15 Mar 2024 00:33:35 -0700 Subject: [PATCH 05/23] allow iq quant --- ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 31b3a3e15921b..15aeb993f9df5 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15498,7 +15498,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #ifdef GGML_SYCL_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else - const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; + const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); #endif // GGML_SYCL_FORCE_DMMV if (use_mul_mat_vec_q) { From 15617b870c66f7dc0cadbadaff1988184007e25e Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 18 Mar 2024 19:43:09 +0530 Subject: [PATCH 06/23] format --- ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 71fa75b4223eb..b8a336503d252 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -4867,7 +4867,7 @@ static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restr #endif } - + /* DPCT1110:4: The total declared local variable size in device function From 32589a642fde0f3e94c80a133a4c9af758a20b00 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:25:19 +0530 Subject: [PATCH 07/23] supress assert --- ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index b8a336503d252..31b7b5354d3e9 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -14081,7 +14081,7 @@ inline void ggml_sycl_op_mul_mat_vec_q( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream) { - GGML_ASSERT(ggml_nrows(src1) == 1); + //GGML_ASSERT(ggml_nrows(src1) == 1); const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; From a553def52e3f35cbad5c7c6c093f7d7b8e829823 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 18 Mar 2024 22:11:19 -0700 Subject: [PATCH 08/23] refactor logic --- ggml-sycl.cpp | 89 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 31b7b5354d3e9..837c71f9f1138 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -4694,6 +4694,24 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri #endif } +inline bool ggml_sycl_supports_mmq(enum ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return true; + default: + return false; + } +} + template static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, const sycl::nd_item<3> &item_ct1, @@ -14082,6 +14100,7 @@ inline void ggml_sycl_op_mul_mat_vec_q( const dpct::queue_ptr &stream) { //GGML_ASSERT(ggml_nrows(src1) == 1); + //GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; @@ -15594,7 +15613,18 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 min_compute_capability = g_device_caps[i].cc; } } - + + + bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 + && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; + bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 + && src1->ne[1] <= XMX_MAX_BATCH_SIZE; + bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32 + && dst->type == GGML_TYPE_F32; + + #ifdef SYCL_USE_XMX const bool use_xmx = true; #else @@ -15609,6 +15639,11 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); +#ifdef GGML_SYCL_FORCE_DMMV + use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; +#endif // GGML_SYCL_FORCE_DMMV + + if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n"); @@ -15621,43 +15656,31 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 // KQ + KQV multi-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n"); ggml_sycl_mul_mat_batched_sycl(src0, src1, dst); - } else if (src0->type == GGML_TYPE_F32) { - // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); - } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { - // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n"); - if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) { -#ifdef GGML_SYCL_FORCE_DMMV - const bool use_mul_mat_vec_q = false; -#else - const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); -#endif // GGML_SYCL_FORCE_DMMV - - if (use_mul_mat_vec_q) { - // NOTE: this kernel does not support ggml_nrows(src1) > 1 - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); - } else { - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); - } - } else { - bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); - - if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { + } else if (use_dequantize_mul_mat_vec){ + // use ggml_sycl_op_dequantize_mul_mat_vec + //GGML_SYCL_DEBUG(""ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n""); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); + } else if (use_mul_mat_vec_q){ + // use ggml_sycl_op_mul_mat_vec_q + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); + + } else if (use_mul_mat_q){ + + if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { use_mul_mat_q = false; } - if (use_mul_mat_q) { - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true); - } else { - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); - } + if (use_mul_mat_q) { + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true); + } else { + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } } else { - GGML_ASSERT(false); + // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } } From 9fa92aa789ebde163abfb3fa401c8464ed125461 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 19 Mar 2024 00:13:08 -0700 Subject: [PATCH 09/23] fix build --- ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 837c71f9f1138..b3bfda0e4a2de 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15617,7 +15617,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; + && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= XMX_MAX_BATCH_SIZE; From 7466e4edefe792b18ef52f045914e06c6849dc3a Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 19 Mar 2024 03:12:36 -0700 Subject: [PATCH 10/23] add quants --- ggml-sycl.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index b3bfda0e4a2de..8dd5f050f4bed 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17505,8 +17505,12 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS) { - return false; + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || + a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { + if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + return false; + } } return true; } break; From f5fed7404e7517b4ada3061c6815824b76f92000 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 19 Mar 2024 03:19:51 -0700 Subject: [PATCH 11/23] add quant types from cuda --- ggml-sycl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 8dd5f050f4bed..a61299d35416d 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17505,6 +17505,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { From 7ea2e1574fa3c026734c98384d8cc099189160b5 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 25 Mar 2024 04:48:28 -0700 Subject: [PATCH 12/23] fix format --- ggml-sycl.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 4fbc9795bf0b2..97c30ae06e6e5 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15355,18 +15355,16 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 min_compute_capability = g_device_caps[i].cc; } } - - + bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 + && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= XMX_MAX_BATCH_SIZE; - bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32 + bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; - - + #ifdef SYCL_USE_XMX const bool use_xmx = true; #else @@ -15405,13 +15403,12 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } else if (use_mul_mat_vec_q){ // use ggml_sycl_op_mul_mat_vec_q // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); - + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); } else if (use_mul_mat_q){ - + if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { use_mul_mat_q = false; - } + } if (use_mul_mat_q) { // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n"); From 551f5a0378d541c97b54330b359001feef49043f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 25 Mar 2024 04:54:43 -0700 Subject: [PATCH 13/23] fix format --- ggml-sycl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 97c30ae06e6e5..3f1931bf63f33 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15356,7 +15356,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) + bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) @@ -15401,9 +15401,9 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //GGML_SYCL_DEBUG(""ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n""); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); } else if (use_mul_mat_vec_q){ - // use ggml_sycl_op_mul_mat_vec_q + // use ggml_sycl_op_mul_mat_vec_q // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); } else if (use_mul_mat_q){ if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { @@ -17246,7 +17246,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { From ada101ef2a6847313254a358434ddc41946d2e86 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 26 Mar 2024 00:01:05 -0700 Subject: [PATCH 14/23] explicit add conditions fp32 --- ggml-sycl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 3f1931bf63f33..1d244003b18d3 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15417,9 +15417,11 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } - } else { + } else if (src0->type == GGML_TYPE_F32){ // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + } else { + GGML_ASSERT(false); } } From d4b182ccd5f5ef2b4a3468e880b99de650bac42e Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 26 Mar 2024 09:05:02 -0700 Subject: [PATCH 15/23] refine condition --- ggml-sycl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 1d244003b18d3..425feb8b0f515 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15360,10 +15360,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src1->ne[1] <= XMX_MAX_BATCH_SIZE; - bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32 - && dst->type == GGML_TYPE_F32; + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; + bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 + && ggml_sycl_supports_mmq(src0->type); #ifdef SYCL_USE_XMX const bool use_xmx = true; From e9377baf7ab0feeac4a2c44ca206fa85576d430c Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 26 Mar 2024 09:19:06 -0700 Subject: [PATCH 16/23] add conditions --- ggml-sycl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 425feb8b0f515..fd9250ade186d 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15359,8 +15359,9 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; - bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; + bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && (ggml_is_quantized(src0->type) + || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 + && dst->type == GGML_TYPE_F32; bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_sycl_supports_mmq(src0->type); From 69aaa3d78be65b7a924122a5fe2ddaf8ad626157 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 26 Mar 2024 19:39:48 -0700 Subject: [PATCH 17/23] revert logic --- ggml-sycl.cpp | 68 ++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index fd9250ade186d..061f99b2771d5 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15356,16 +15356,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; - bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && (ggml_is_quantized(src0->type) - || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 - && dst->type == GGML_TYPE_F32; - bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && ggml_sycl_supports_mmq(src0->type); - #ifdef SYCL_USE_XMX const bool use_xmx = true; #else @@ -15380,11 +15370,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); -#ifdef GGML_SYCL_FORCE_DMMV - use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; -#endif // GGML_SYCL_FORCE_DMMV - - if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n"); @@ -15397,30 +15382,41 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 // KQ + KQV multi-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n"); ggml_sycl_mul_mat_batched_sycl(src0, src1, dst); - } else if (use_dequantize_mul_mat_vec){ - // use ggml_sycl_op_dequantize_mul_mat_vec - //GGML_SYCL_DEBUG(""ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n""); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); - } else if (use_mul_mat_vec_q){ - // use ggml_sycl_op_mul_mat_vec_q - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); - } else if (use_mul_mat_q){ - - if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { - use_mul_mat_q = false; - } + }else if (src0->type == GGML_TYPE_F32) { + // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { + // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n"); + if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) { +#ifdef GGML_SYCL_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); +#endif // GGML_SYCL_FORCE_DMMV - if (use_mul_mat_q) { - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true); + if (use_mul_mat_vec_q) { + // NOTE: this kernel does not support ggml_nrows(src1) > 1 + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); + } else { + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); + } } else { - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); + + if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { + use_mul_mat_q = false; + } + + if (use_mul_mat_q) { + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true); + } else { + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n"); + ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + } } - } else if (src0->type == GGML_TYPE_F32){ - // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); - ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } else { GGML_ASSERT(false); } From 19772fab9c3a2d26b2f3bd077378910e6e47e466 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 26 Mar 2024 20:36:02 -0700 Subject: [PATCH 18/23] add condition for iq2s --- ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 061f99b2771d5..0cfcd4e6bed64 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15382,7 +15382,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 // KQ + KQV multi-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n"); ggml_sycl_mul_mat_batched_sycl(src0, src1, dst); - }else if (src0->type == GGML_TYPE_F32) { + } else if (src0->type == GGML_TYPE_F32) { // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { From 871a135bd3ff1850d7c733e54bdc9de82b0270bc Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 26 Mar 2024 20:42:51 -0700 Subject: [PATCH 19/23] iq2s and other quant logic add --- ggml-sycl.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 0cfcd4e6bed64..99cc5ef86abdf 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -13972,6 +13972,24 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( case GGML_TYPE_Q6_K: dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_F16: convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; From ff4ace5e488756fb796205c9dc26d860143ec1f9 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 27 Mar 2024 12:36:57 +0530 Subject: [PATCH 20/23] disable to check perf --- ggml-sycl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 99cc5ef86abdf..7fbb9bebe5c32 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17267,9 +17267,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { - if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + //if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; - } + //} } return true; } break; From 4e6df37d12911c6db50c1215c2e4ca5fd9296509 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:48:40 +0530 Subject: [PATCH 21/23] enable with rebase --- ggml-sycl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index d66b7b89a8055..cf38c28218339 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17267,9 +17267,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { - //if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; - //} + } } return true; } break; From 619ce80144c8196ad93c547306f01de8d26f685a Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:06:29 +0530 Subject: [PATCH 22/23] Update ggml-sycl.cpp Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --- ggml-sycl.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cf38c28218339..291594459eadf 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17263,11 +17263,15 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; + // No support in mmvq or other methods + if (a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS) + return false; + // Support in mmvq if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { - if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ3_S || a_type == GGML_TYPE_IQ2_S ) { + // condition for using mmvq + if (b->ne[1] > 1 || a->ne[0] % GGML_SYCL_DMMV_X != 0) { return false; } } From 935eabd917bb480f71072de67d150e0c0ff72a63 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 27 Mar 2024 22:56:15 -0700 Subject: [PATCH 23/23] add condition --- ggml-sycl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 291594459eadf..c2fb20c17e910 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15421,7 +15421,8 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); } } else { - bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); + bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) + && ggml_sycl_supports_mmq(src0->type); if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { use_mul_mat_q = false;