Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
#endif
}

template <typename dst_t>
static void dequantize_row_q3_K_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int64_t nb = k / QK_K;

dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
[=](sycl::nd_item<3> item_ct1) {
dequantize_block_q3_K_reorder(vx, y, item_ct1, nb);
});
}

template <typename dst_t>
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
Expand Down Expand Up @@ -652,7 +665,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q3_K_sycl_reorder;
} else {
return dequantize_row_q3_K_sycl;
}
case GGML_TYPE_Q4_K:
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q4_K_sycl_reorder;
Expand Down Expand Up @@ -730,7 +747,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q3_K_sycl_reorder;
} else {
return dequantize_row_q3_K_sycl;
}
case GGML_TYPE_Q4_K:
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
Expand Down
57 changes: 57 additions & 0 deletions ggml/src/ggml-sycl/dequantize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,63 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri

}

template<typename dst_t>
static void dequantize_block_q3_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
#if QK_K == 256
const int64_t i = item_ct1.get_group(2);
if (i >= n_blocks) {
return;
}

const uint8_t * base = static_cast<const uint8_t *>(vx);
const size_t qs_offset = i * (QK_K / 4);
const size_t hmask_offset = n_blocks * (QK_K / 4) + i * (QK_K / 8);
const size_t scales_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + i * 12;
const size_t d_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + n_blocks * 12 +
i * sizeof(ggml_half);

const uint8_t * qs = base + qs_offset;
const uint8_t * hmask = base + hmask_offset;
const uint8_t * scales = base + scales_offset;
const float d_all = static_cast<float>(*reinterpret_cast<const ggml_half *>(base + d_offset));

const int64_t r = item_ct1.get_local_id(2) / 4;
const int64_t tid = r / 2;
const int64_t is0 = r % 2;
const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
const int64_t n = tid / 4;
const int64_t j = tid - 4 * n;
const int64_t is = 8 * n + 2 * j + is0;
const int shift = 2 * j;
uint8_t m = 1 << (4 * n + j);

uint8_t us = is < 4
? (scales[is - 0] & 0xF) | (((scales[is + 8] >> 0) & 3) << 4)
: is < 8
? (scales[is - 0] & 0xF) | (((scales[is + 4] >> 2) & 3) << 4)
: is < 12
? (scales[is - 8] >> 4) | (((scales[is + 0] >> 4) & 3) << 4)
: (scales[is - 8] >> 4) | (((scales[is - 4] >> 6) & 3) << 4);

const float dl = d_all * (us - 32);

dst_t * y = yy + i * QK_K + 128 * n + 32 * j;
const uint8_t * q = qs + 32 * n;
const uint8_t * hm = hmask;

for (int l = l0; l < l0 + 4; ++l) {
y[l] = dl * ((int8_t) ((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
}
#else
GGML_UNUSED(vx);
GGML_UNUSED(yy);
GGML_UNUSED(item_ct1);
GGML_UNUSED(n_blocks);
GGML_ABORT("Q3_K reorder dequantize not supported for QK_K != 256");
#endif
}

#if QK_K == 256
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {
Expand Down
120 changes: 119 additions & 1 deletion ggml/src/ggml-sycl/dmmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,103 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
}
}

static void dequantize_mul_mat_vec_q3_k_reorder(const void *__restrict__ vx,
const float *__restrict__ yy,
float *__restrict__ dst,
const int ncols, int nrows,
const sycl::nd_item<3> &item_ct1) {

const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
if (row > nrows) return;

const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;

// SOA base pointers for the reordered layout:
// [qs: nb * (QK_K/4)] [hmask: nb * (QK_K/8)] [scales: nb * 12] [d: nb * sizeof(half)]
const int nb = nrows * num_blocks_per_row;
const uint8_t * qs_base = (const uint8_t *)vx;
const uint8_t * hmask_base = qs_base + (size_t)nb * (QK_K / 4);
const uint8_t * scales_base = hmask_base + (size_t)nb * (QK_K / 8);
const sycl::half * d_base = (const sycl::half *)(scales_base + (size_t)nb * 12);

float tmp = 0; // partial sum for thread in warp

#if QK_K == 256

const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;

const int tid =
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix =
item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1

const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
const int step = 16/K_QUANTS_PER_ITERATION;
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0....15 or 0...7

const uint8_t m = 1 << (4*im);

const int l0 = n*in; // 0...15 or 0...14 in steps of 2
const int q_offset = 32*im + l0;
const int y_offset = 128*im + l0;

uint16_t utmp[4];
const int8_t * s = (const int8_t *)utmp;

const uint16_t s_shift = 4*im;

for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const int bi = ib0 + i;

const float * y = yy + i * QK_K + y_offset;
const uint8_t * q = qs_base + bi * (QK_K / 4) + q_offset;
const uint8_t * h = hmask_base + bi * (QK_K / 8) + l0;

const uint16_t * a = (const uint16_t *)(scales_base + bi * 12);
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);

const float d = d_base[bi];

float sum = 0;
for (int l = 0; l < n; ++l) {
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
}
tmp += d * sum;
}
#else
GGML_UNUSED(vx);
GGML_UNUSED(yy);
GGML_UNUSED(ncols);
GGML_UNUSED(item_ct1);
GGML_ABORT("Q3_K reorder DMMV not supported for QK_K != 256");
#endif

// sum up partial sums and write back result
#pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}

if (item_ct1.get_local_id(2) == 0) {
dst[row] = tmp;
}
}

/*
DPCT1110:6: The total declared local variable size in device function
dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
Expand Down Expand Up @@ -1440,6 +1537,22 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
});
}

static void dequantize_mul_mat_vec_q3_K_sycl_reorder(const void *vx, const float *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
dequantize_mul_mat_vec_q3_k_reorder(vx, y, dst, ncols, nrows, item_ct1);
});
}

static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
float *dst, const int ncols,
const int nrows,
Expand Down Expand Up @@ -1581,7 +1694,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K:
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
dequantize_mul_mat_vec_q3_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
} else {
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q4_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
Expand Down
52 changes: 52 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3549,6 +3549,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
Expand All @@ -3572,6 +3573,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
Expand Down Expand Up @@ -3791,6 +3793,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
return true;
}

static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);

const int nblocks = size / sizeof(block_q3_K);

sycl_reorder_temp_buffer tmp(stream, size);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}

auto * qs_ptr = data_device;
auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks;
auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks;
sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks);

auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q3_K * x = (const block_q3_K *) tmp_buf;
const int ib = i;

for (int j = 0; j < QK_K / 4; ++j) {
qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j];
}

for (int j = 0; j < QK_K / 8; ++j) {
hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j];
}

for (int j = 0; j < 12; ++j) {
scales_ptr[ib * 12 + j] = x[ib].scales[j];
}

d_ptr[ib] = x[ib].d;
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
return true;
}

static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
Expand Down Expand Up @@ -3903,6 +3953,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
case GGML_TYPE_Q8_0:
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
case GGML_TYPE_Q3_K:
return reorder_qw_q3_k(data_device, size, 0, stream);
case GGML_TYPE_Q4_K:
return reorder_qw_q4_k(data_device, size, 0, stream);
case GGML_TYPE_Q5_K:
Expand Down
30 changes: 29 additions & 1 deletion ggml/src/ggml-sycl/mmvq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,26 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
}
}

static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);

// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
constexpr size_t num_subgroups = WARP_SIZE;
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;

const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q3_K>>(vx, vy, dst, ncols, nrows,
nd_item);
});
});
}

static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
Expand Down Expand Up @@ -1153,7 +1173,15 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n");
reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff,
stream);
} else {
GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl\n");
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q4_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
Expand Down
Loading
Loading