From 08d3b401906d46ab22a9053f17eb7b3c02a1c964 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Thu, 14 Mar 2024 04:42:29 -0700
Subject: [PATCH 01/23] iq2_s

---
 ggml-sycl.cpp | 206 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 204 insertions(+), 2 deletions(-)
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 9f6506383cc0d..d19e290030f01 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -1,3 +1,4 @@
+
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
@@ -4732,6 +4733,36 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
 
 }
 
+template<typename dst_t>
+static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                     const sycl::nd_item<3> &item_ct1,
+                                     const uint64_t *iq2s_grid,
+                                     const uint8_t *ksigns_iq2xs,
+                                     const uint8_t *kmask_iq2xs) {
+    const int i = item_ct1.get_group(2);
+    const block_iq2_s * x = (const block_iq1_s  *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t  * qs = x[i].qs + 8*ib;
+    const uint8_t  * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+                                    
+
 /*
 DPCT1110:4: The total declared local variable size in device function
 dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -7648,6 +7679,64 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
 #endif
 }
 
+static __dpct_inline__ float
+vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
+                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                    const uint64_t *iq2s_grid, const uint64_t *ksigns64) {
+#if QK_K == 256
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t * q8 = bq8_1[ib32].qs;
+    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >>  4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs1, signs1, std::minus<>());
+        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs1, signs1, std::minus<>());
+        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+}
+
+
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
           int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
           vec_dot_q_mul_mat_sycl_t vec_dot>
@@ -8504,6 +8593,53 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void *
     }
 }
 
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
+                          const sycl::nd_item<3> &item_ct1,
+                          const uint64_t *iq2s_grid_ptr, const uint64_t *ksigns64_ptr ) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs, iq2s_grid_ptr, ksigns64_ptr);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
                                    const sycl::nd_item<3> &item_ct1) {
@@ -10247,6 +10383,36 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
+                                        dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        iq2s_grid.init(*stream);
+        ksigns_iq2xs.init(*stream);
+        kmask_iq2xs.init(*stream);
+
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr();
+            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
+            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_s(
+                                     vx, y, item_ct1, iq2s_grid_ptr_ct1,
+                                     ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+                             });
+        });
+    }
+}
+
+
 template <typename src_t, typename dst_t>
 static void convert_unary_sycl(const void *__restrict__ vx,
                                dst_t *__restrict__ y, const int k,
@@ -10301,6 +10467,8 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
             return dequantize_row_iq3_s_sycl;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
         case GGML_TYPE_F32:
             return convert_unary_sycl<float>;
         default:
@@ -10345,6 +10513,8 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
             return dequantize_row_iq3_s_sycl;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
         case GGML_TYPE_F16:
             return convert_unary_sycl<sycl::half>;
         default:
@@ -10990,6 +11160,35 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     }
 }
 
+static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        iq2s_grid.init(*stream);
+        ksigns64.init(*stream);
+
+        stream->submit([&](sycl::handler &cgh) {
+            auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr();
+            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1,
+                            iq2s_grid_ptr_ct1, ksigns64_ptr_ct1);
+                    });
+        });
+    }
+}
+
+
 static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
@@ -13738,6 +13937,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ2_S:
         case GGML_TYPE_IQ3_XXS:
             return max_compute_capability >= VER_GEN9 ? 128 : 64;
         case GGML_TYPE_IQ3_S:
@@ -13808,6 +14008,9 @@ inline void ggml_sycl_op_mul_mat_vec_q(
         case GGML_TYPE_IQ1_S:
             mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
+         case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -17153,8 +17356,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
-                    a_type == GGML_TYPE_IQ4_XS) {
+                if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS) {
                     return false;
                 }
                 return true;

From 9b030b98a64d5aa48b9fef406e6184f0a552be95 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Thu, 14 Mar 2024 04:47:03 -0700
Subject: [PATCH 02/23] iq2_s

---
 ggml-sycl.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index d19e290030f01..00ce8e7fae419 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -1,4 +1,3 @@
-
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation

From 81b6139f4c526d12f52962f042e8cb6d404d76bf Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Thu, 14 Mar 2024 04:57:58 -0700
Subject: [PATCH 03/23] bug fix

---
 ggml-sycl.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 00ce8e7fae419..fdef752bcbb33 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -7727,15 +7727,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     assert(false);
     return 0.f;
 #endif
-#else
-    (void) ksigns64;
-    assert(false);
-    return 0.f;
-#endif
 }
 
-
-
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
           int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
           vec_dot_q_mul_mat_sycl_t vec_dot>

From 0af3ed733fa6b10a5be5c6d28f98dc01e740ab6e Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Thu, 14 Mar 2024 05:06:22 -0700
Subject: [PATCH 04/23] bug fix

---
 ggml-sycl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index fdef752bcbb33..31b3a3e15921b 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -4739,7 +4739,7 @@ static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restr
                                      const uint8_t *ksigns_iq2xs,
                                      const uint8_t *kmask_iq2xs) {
     const int i = item_ct1.get_group(2);
-    const block_iq2_s * x = (const block_iq1_s  *) vx;
+    const block_iq2_s * x = (const block_iq2_s  *) vx;
 
     const int tid = item_ct1.get_local_id(2);
 #if QK_K == 256
@@ -4747,8 +4747,8 @@ static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restr
     const int ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t  * qs = x[i].qs + 8*ib;
-    const uint8_t  * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
+    const uint8_t  * grid1 = (const uint8_t *)(iq2s_grid + qs[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq2s_grid + qs[2*il+1]);
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
     const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
     for (int j = 0; j < 4; ++j) {
@@ -7686,7 +7686,7 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
     const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
 
     const int ib32 = iqs;
-    const uint8_t * q8 = bq8_1[ib32].qs;
+    const int8_t * q8 = bq8_1[ib32].qs;
     const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
     const uint8_t ls1 = bq2->scales[ib32] & 0xf;
     const uint8_t ls2 = bq2->scales[ib32] >>  4;

From 87e5c86686e3c87ddf26829f863a4b5f9ed7ab83 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Fri, 15 Mar 2024 00:33:35 -0700
Subject: [PATCH 05/23] allow iq quant

---
 ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 31b3a3e15921b..15aeb993f9df5 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15498,7 +15498,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_SYCL_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
-            const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+            const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
 #endif // GGML_SYCL_FORCE_DMMV
 
             if (use_mul_mat_vec_q) {

From 15617b870c66f7dc0cadbadaff1988184007e25e Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Mon, 18 Mar 2024 19:43:09 +0530
Subject: [PATCH 06/23] format

---
 ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 71fa75b4223eb..b8a336503d252 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -4867,7 +4867,7 @@ static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restr
 #endif
 
 }
-                                    
+
 
 /*
 DPCT1110:4: The total declared local variable size in device function

From 32589a642fde0f3e94c80a133a4c9af758a20b00 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Mon, 18 Mar 2024 22:25:19 +0530
Subject: [PATCH 07/23] supress assert

---
 ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index b8a336503d252..31b7b5354d3e9 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -14081,7 +14081,7 @@ inline void ggml_sycl_op_mul_mat_vec_q(
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const dpct::queue_ptr &stream) {
 
-    GGML_ASSERT(ggml_nrows(src1) == 1);
+    //GGML_ASSERT(ggml_nrows(src1) == 1);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;

From a553def52e3f35cbad5c7c6c093f7d7b8e829823 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Mon, 18 Mar 2024 22:11:19 -0700
Subject: [PATCH 08/23] refactor logic

---
 ggml-sycl.cpp | 89 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 33 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 31b7b5354d3e9..837c71f9f1138 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -4694,6 +4694,24 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
 #endif
 }
 
+inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return true;
+        default:
+            return false;
+    }
+}
+
 template<typename dst_t>
 static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                      const sycl::nd_item<3> &item_ct1,
@@ -14082,6 +14100,7 @@ inline void ggml_sycl_op_mul_mat_vec_q(
     const dpct::queue_ptr &stream) {
 
     //GGML_ASSERT(ggml_nrows(src1) == 1);
+    //GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
@@ -15594,7 +15613,18 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             min_compute_capability = g_device_caps[i].cc;
         }
     }
-
+    
+    
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) 
+                                            && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+                                            && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
+                             && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+                             && src1->ne[1] <= XMX_MAX_BATCH_SIZE;
+    bool use_mul_mat_q =   ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32 
+                           && dst->type == GGML_TYPE_F32;
+                             
+                             
 #ifdef SYCL_USE_XMX
     const bool use_xmx = true;
 #else
@@ -15609,6 +15639,11 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
+#ifdef GGML_SYCL_FORCE_DMMV
+             use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_SYCL_FORCE_DMMV
+
+
     if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
@@ -15621,43 +15656,31 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         // KQ + KQV multi-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
         ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) {
-#ifdef GGML_SYCL_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-#endif // GGML_SYCL_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
-            } else {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-
-            if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
+    } else if (use_dequantize_mul_mat_vec){
+        // use ggml_sycl_op_dequantize_mul_mat_vec
+        //GGML_SYCL_DEBUG(""ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n"");
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
+    } else if (use_mul_mat_vec_q){
+        // use ggml_sycl_op_mul_mat_vec_q 
+        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+    
+    } else if (use_mul_mat_q){
+    
+        if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;
             }
 
-            if (use_mul_mat_q) {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
-            } else {
-                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
-                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-            }
+        if (use_mul_mat_q) {
+            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
+            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+        } else {
+            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
+            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
         }
     } else {
-        GGML_ASSERT(false);
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
 

From 9fa92aa789ebde163abfb3fa401c8464ed125461 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 19 Mar 2024 00:13:08 -0700
Subject: [PATCH 09/23] fix build

---
 ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 837c71f9f1138..b3bfda0e4a2de 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15617,7 +15617,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) 
                                             && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-                                            && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+                                            && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
                              && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
                              && src1->ne[1] <= XMX_MAX_BATCH_SIZE;

From 7466e4edefe792b18ef52f045914e06c6849dc3a Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 19 Mar 2024 03:12:36 -0700
Subject: [PATCH 10/23] add quants

---
 ggml-sycl.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index b3bfda0e4a2de..8dd5f050f4bed 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17505,8 +17505,12 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS) {
-                    return false;
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
+                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                        return false;
+                    }
                 }
                 return true;
             } break;

From f5fed7404e7517b4ada3061c6815824b76f92000 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 19 Mar 2024 03:19:51 -0700
Subject: [PATCH 11/23] add quant types from cuda

---
 ggml-sycl.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 8dd5f050f4bed..a61299d35416d 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17505,6 +17505,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
+                
                 if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                     a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
                     a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {

From 7ea2e1574fa3c026734c98384d8cc099189160b5 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Mon, 25 Mar 2024 04:48:28 -0700
Subject: [PATCH 12/23] fix format

---
 ggml-sycl.cpp | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 4fbc9795bf0b2..97c30ae06e6e5 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15355,18 +15355,16 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             min_compute_capability = g_device_caps[i].cc;
         }
     }
-    
-    
+
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) 
-                                            && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-                                            && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+                                      && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+                                      && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
                              && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
                              && src1->ne[1] <= XMX_MAX_BATCH_SIZE;
-    bool use_mul_mat_q =   ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32 
+    bool use_mul_mat_q =   ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32
                            && dst->type == GGML_TYPE_F32;
-                             
-                             
+
 #ifdef SYCL_USE_XMX
     const bool use_xmx = true;
 #else
@@ -15405,13 +15403,12 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     } else if (use_mul_mat_vec_q){
         // use ggml_sycl_op_mul_mat_vec_q 
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
-    
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);    
     } else if (use_mul_mat_q){
-    
+
         if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;
-            }
+        }
 
         if (use_mul_mat_q) {
             // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");

From 551f5a0378d541c97b54330b359001feef49043f Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Mon, 25 Mar 2024 04:54:43 -0700
Subject: [PATCH 13/23] fix format

---
 ggml-sycl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 97c30ae06e6e5..3f1931bf63f33 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15356,7 +15356,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) 
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
                                       && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
                                       && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
@@ -15401,9 +15401,9 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         //GGML_SYCL_DEBUG(""ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n"");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
     } else if (use_mul_mat_vec_q){
-        // use ggml_sycl_op_mul_mat_vec_q 
+        // use ggml_sycl_op_mul_mat_vec_q
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);    
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q){
 
         if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
@@ -17246,7 +17246,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
-                
+
                 if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                     a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
                     a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {

From ada101ef2a6847313254a358434ddc41946d2e86 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 26 Mar 2024 00:01:05 -0700
Subject: [PATCH 14/23] explicit add conditions fp32

---
 ggml-sycl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 3f1931bf63f33..1d244003b18d3 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15417,9 +15417,11 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
             ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
         }
-    } else {
+    } else if (src0->type == GGML_TYPE_F32){
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+    } else {
+        GGML_ASSERT(false);
     }
 }
 

From d4b182ccd5f5ef2b4a3468e880b99de650bac42e Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 26 Mar 2024 09:05:02 -0700
Subject: [PATCH 15/23] refine condition

---
 ggml-sycl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 1d244003b18d3..425feb8b0f515 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15360,10 +15360,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                                       && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
                                       && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
-                             && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-                             && src1->ne[1] <= XMX_MAX_BATCH_SIZE;
-    bool use_mul_mat_q =   ggml_sycl_supports_mmq(src0->type) && src1->type == GGML_TYPE_F32
-                           && dst->type == GGML_TYPE_F32;
+                             && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+    bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
+                         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+                         && ggml_sycl_supports_mmq(src0->type);
 
 #ifdef SYCL_USE_XMX
     const bool use_xmx = true;

From e9377baf7ab0feeac4a2c44ca206fa85576d430c Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 26 Mar 2024 09:19:06 -0700
Subject: [PATCH 16/23] add conditions

---
 ggml-sycl.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 425feb8b0f515..fd9250ade186d 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15359,8 +15359,9 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
                                       && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
                                       && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
-                             && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && (ggml_is_quantized(src0->type) 
+                             || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 
+                             && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
                          && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
                          && ggml_sycl_supports_mmq(src0->type);

From 69aaa3d78be65b7a924122a5fe2ddaf8ad626157 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 26 Mar 2024 19:39:48 -0700
Subject: [PATCH 17/23] revert logic

---
 ggml-sycl.cpp | 68 ++++++++++++++++++++++++---------------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index fd9250ade186d..061f99b2771d5 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15356,16 +15356,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
-                                      && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-                                      && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && (ggml_is_quantized(src0->type) 
-                             || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 
-                             && dst->type == GGML_TYPE_F32;
-    bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
-                         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-                         && ggml_sycl_supports_mmq(src0->type);
-
 #ifdef SYCL_USE_XMX
     const bool use_xmx = true;
 #else
@@ -15380,11 +15370,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-#ifdef GGML_SYCL_FORCE_DMMV
-             use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_SYCL_FORCE_DMMV
-
-
     if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
@@ -15397,30 +15382,41 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         // KQ + KQV multi-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
         ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
-    } else if (use_dequantize_mul_mat_vec){
-        // use ggml_sycl_op_dequantize_mul_mat_vec
-        //GGML_SYCL_DEBUG(""ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n"");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-    } else if (use_mul_mat_vec_q){
-        // use ggml_sycl_op_mul_mat_vec_q
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
-    } else if (use_mul_mat_q){
-
-        if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-        }
+    }else if (src0->type == GGML_TYPE_F32) {
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
+        // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) {
+#ifdef GGML_SYCL_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
+#endif // GGML_SYCL_FORCE_DMMV
 
-        if (use_mul_mat_q) {
-            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
-            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+            if (use_mul_mat_vec_q) {
+                // NOTE: this kernel does not support ggml_nrows(src1) > 1
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+            } else {
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
+            }
         } else {
-            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
-            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
+
+            if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
+                use_mul_mat_q = false;
+            }
+
+            if (use_mul_mat_q) {
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+            } else {
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+            }
         }
-    } else if (src0->type == GGML_TYPE_F32){
-        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
-        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     } else {
         GGML_ASSERT(false);
     }

From 19772fab9c3a2d26b2f3bd077378910e6e47e466 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 26 Mar 2024 20:36:02 -0700
Subject: [PATCH 18/23] add condition for iq2s

---
 ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 061f99b2771d5..0cfcd4e6bed64 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15382,7 +15382,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         // KQ + KQV multi-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
         ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
-    }else if (src0->type == GGML_TYPE_F32) {
+    } else if (src0->type == GGML_TYPE_F32) {
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {

From 871a135bd3ff1850d7c733e54bdc9de82b0270bc Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 26 Mar 2024 20:42:51 -0700
Subject: [PATCH 19/23] iq2s and other quant logic add

---
 ggml-sycl.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 0cfcd4e6bed64..99cc5ef86abdf 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -13972,6 +13972,24 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
         case GGML_TYPE_Q6_K:
             dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+         case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_F16:
             convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;

From ff4ace5e488756fb796205c9dc26d860143ec1f9 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Wed, 27 Mar 2024 12:36:57 +0530
Subject: [PATCH 20/23] disable to check perf

---
 ggml-sycl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 99cc5ef86abdf..7fbb9bebe5c32 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17267,9 +17267,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                 if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                     a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
                     a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
-                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                    //if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                         return false;
-                    }
+                    //}
                 }
                 return true;
             } break;

From 4e6df37d12911c6db50c1215c2e4ca5fd9296509 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Wed, 27 Mar 2024 13:48:40 +0530
Subject: [PATCH 21/23] enable with rebase

---
 ggml-sycl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index d66b7b89a8055..cf38c28218339 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17267,9 +17267,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                 if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                     a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
                     a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
-                    //if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                         return false;
-                    //}
+                    }
                 }
                 return true;
             } break;

From 619ce80144c8196ad93c547306f01de8d26f685a Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Thu, 28 Mar 2024 11:06:29 +0530
Subject: [PATCH 22/23] Update ggml-sycl.cpp

Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
---
 ggml-sycl.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index cf38c28218339..291594459eadf 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17263,11 +17263,15 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
+                // No support in mmvq or other methods
+                if (a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS)
+                    return false;
 
+                // Support in mmvq
                 if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
-                    a_type == GGML_TYPE_IQ2_S   || a_type == GGML_TYPE_IQ4_XS) {
-                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ3_S  || a_type == GGML_TYPE_IQ2_S   ) {
+                    // condition for using mmvq
+                    if (b->ne[1] > 1 || a->ne[0] % GGML_SYCL_DMMV_X != 0) {
                         return false;
                     }
                 }

From 935eabd917bb480f71072de67d150e0c0ff72a63 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Wed, 27 Mar 2024 22:56:15 -0700
Subject: [PATCH 23/23] add condition

---
 ggml-sycl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 291594459eadf..c2fb20c17e910 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15421,7 +15421,8 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
             }
         } else {
-            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
+            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
+                                 && ggml_sycl_supports_mmq(src0->type);
 
             if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;