Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ option(GGML_RPC "ggml: use RPC"
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ if (GGML_SYCL_GRAPH)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
endif()

if (GGML_SYCL_HOST_MEM_FALLBACK)
message(STATUS "find GGML_SYCL_HOST_MEM_FALLBACK")
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_HOST_MEM_FALLBACK)
endif()

if (GGML_SYCL_DEVICE_ARCH)
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
Expand Down
33 changes: 31 additions & 2 deletions ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,25 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int

}

template <typename dst_t>
static void dequantize_row_q8_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {

dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});

int constexpr WARP_K = WARP_SIZE * QK8_0;
const int n_warp = (k + WARP_K - 1) / WARP_K;
GGML_ASSERT(k % QK8_0 == 0);
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
sycl::range<3>(1, 1, WARP_SIZE),
sycl::range<3>(1, 1, WARP_SIZE)),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
dequantize_block_q8_0_reorder(vx, y, k, item_ct1);
});

}

template <typename dst_t>
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
Expand Down Expand Up @@ -614,7 +633,12 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_Q5_1:
return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q8_0_sycl_reorder;
} else {
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
}
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
Expand Down Expand Up @@ -683,7 +707,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_Q5_1:
return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q8_0_sycl_reorder;
} else {
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
}
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
Expand Down
28 changes: 28 additions & 0 deletions ggml/src/ggml-sycl/dequantize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,34 @@ static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t *

}

// Dequantize Q8_0 from reorder layout: [all qs (k bytes)][all d values]
// Each thread handles one block of QK8_0 elements.
template<typename dst_t>
static void dequantize_block_q8_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t k,
const sycl::nd_item<3> &item_ct1) {

const int64_t i = item_ct1.get_group(2);
const int64_t tid = item_ct1.get_local_id(2);
const int lane_ib = i * WARP_SIZE + tid;

if (lane_ib >= k / QK8_0) {
return;
}

dst_t * y_ptr = yy + lane_ib * QK8_0;

auto qs = (const int8_t*)vx + lane_ib * QK8_0;
auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k) + lane_ib;

const float d = float(*s_ptr);

#pragma unroll
for (int l = 0; l < QK8_0; ++l) {
y_ptr[l] = d * qs[l];
}

}

template<typename dst_t>
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
const sycl::nd_item<3> &item_ct1) {
Expand Down
106 changes: 82 additions & 24 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3333,9 +3333,55 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
sycl::free(ptr, *stream);
}

static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
// RAII wrapper for temporary reorder buffers with optional host memory fallback.
// When device allocation fails and GGML_SYCL_HOST_MEM_FALLBACK is enabled,
// falls back to host memory so the reorder kernel can still run (over PCIe).
// Device access to host memory requires Linux kernel 6.8+ (Ubuntu 26.04+).
struct sycl_reorder_temp_buffer {
void * ptr = nullptr;
dpct::queue_ptr stream;

sycl_reorder_temp_buffer(dpct::queue_ptr stream, size_t size) : stream(stream) {
ptr = sycl_ext_malloc_device(stream, size);
#ifdef GGML_SYCL_HOST_MEM_FALLBACK
if (!ptr) {
ptr = sycl::malloc_host(size, *stream);
if (ptr) {
host_fallback = true;
GGML_LOG_WARN("%s: device alloc of %zu bytes failed, using host memory fallback\n", __func__, size);
}
}
#endif
}

~sycl_reorder_temp_buffer() {
if (!ptr) {
return;
}
if (host_fallback) {
sycl::free(ptr, *stream);
} else {
sycl_ext_free(stream, ptr);
}
}

explicit operator bool() const { return ptr != nullptr; }

sycl_reorder_temp_buffer(const sycl_reorder_temp_buffer &) = delete;
sycl_reorder_temp_buffer & operator=(const sycl_reorder_temp_buffer &) = delete;

private:
bool host_fallback = false;
};

static bool reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl_reorder_temp_buffer tmp(stream, size);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
Expand Down Expand Up @@ -3364,12 +3410,17 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
return true;
}

static void reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
static bool reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl_reorder_temp_buffer tmp(stream, size);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
Expand Down Expand Up @@ -3398,16 +3449,21 @@ static void reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nr
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
return true;
}

static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);

const int nblocks = size / sizeof(block_q4_K);

uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl_reorder_temp_buffer tmp(stream, size);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
Expand Down Expand Up @@ -3436,16 +3492,21 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
return true;
}

static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);

const int nblocks = size / sizeof(block_q6_K);

uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl_reorder_temp_buffer tmp(stream, size);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);

sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
Expand Down Expand Up @@ -3484,31 +3545,27 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
return true;
}

static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
uint8_t * data_device = (uint8_t *) src0->data;
size_t ncols = src0->ne[0];
size_t nrows = src0->ne[1];
size_t size = ggml_nbytes(src0);

switch (src0->type) {
case GGML_TYPE_Q4_0:
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
break;
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
case GGML_TYPE_Q8_0:
reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
break;
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
case GGML_TYPE_Q4_K:
reorder_qw_q4_k(data_device, size, 0, stream);
break;
return reorder_qw_q4_k(data_device, size, 0, stream);
case GGML_TYPE_Q6_K:
reorder_qw_q6_k(data_device, size, 0, stream);
break;
return reorder_qw_q6_k(data_device, size, 0, stream);
default:
GGML_ABORT("reorder_qw() called with unsupported type");
break;
return false;
}
}

Expand Down Expand Up @@ -3548,8 +3605,9 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
break;
}

reorder_qw(src0, ctx->stream());
extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
if (reorder_qw(src0, ctx->stream())) {
extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
}
}


Expand Down