From 9c33b2cfde66dba952977b180fbbbb7db49ced5d Mon Sep 17 00:00:00 2001 From: PMZFX Date: Tue, 7 Apr 2026 21:05:33 -0400 Subject: [PATCH 01/12] SYCL: fix multi-GPU system RAM exhaustion by using Level Zero allocations Replace sycl::malloc_device with zeMemAllocDevice for GPU memory allocation in the SYCL backend. sycl::malloc_device triggers the xe kernel driver's DMA-buf/TTM path which mirrors every VRAM allocation 1:1 in system RAM. zeMemAllocDevice uses the SVM/P2P path with no host staging. On a dual Intel Arc Pro B70 system (64GB VRAM, 64GB RAM), a 15.6 GiB model consumed 60 GiB of system RAM via sycl::malloc_device, causing OOM crashes. With zeMemAllocDevice, the same workload uses ~6.7 GiB of system RAM with no performance regression. All Level Zero calls include automatic fallback to the original SYCL allocation path if Level Zero interop is unavailable. --- ggml/src/ggml-sycl/CMakeLists.txt | 11 +++++ ggml/src/ggml-sycl/common.cpp | 16 ++++++- ggml/src/ggml-sycl/dpct/helper.hpp | 10 ++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 73 ++++++++++++++++++++++-------- 4 files changed, 89 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 8e589fa238d..c4dec5d7653 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -93,6 +93,17 @@ endif() target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") +# Link against Level Zero loader for direct device memory allocation. +# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging +# in the xe kernel driver during multi-GPU inference. +find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ENV LD_LIBRARY_PATH) +if(ZE_LOADER_LIB) + target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) + message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}") +else() + message(WARNING "Level Zero loader (ze_loader) not found, multi-GPU may use excessive system RAM") +endif() + # Link against oneDNN set(GGML_SYCL_DNNL 0) if(GGML_SYCL_DNN) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 05fd5ef46c7..7d5caa75623 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -11,6 +11,8 @@ // #include "common.hpp" +#include +#include #include "ggml-backend-impl.h" #include "ggml-impl.h" @@ -75,8 +77,18 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector str } if (extra->data_device[i] != nullptr && streams.size()>0) { ggml_sycl_set_device(i); - SYCL_CHECK( - CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); + bool freed = false; + try { + auto ze_ctx = sycl::get_native( + streams[i]->get_context()); + if (zeMemFree(ze_ctx, extra->data_device[i]) == ZE_RESULT_SUCCESS) { + freed = true; + } + } catch (...) {} + if (!freed) { + SYCL_CHECK( + CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); + } } } delete extra; diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 791d3cac52e..055cb309735 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -15,6 +15,8 @@ #include #include +#include +#include #include #include @@ -1307,6 +1309,14 @@ namespace dpct static inline void *dpct_malloc(size_t size, sycl::queue &q) { + try { + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; + void *ptr = nullptr; + if (zeMemAllocDevice(ze_ctx, &desc, size, 64, ze_dev, &ptr) == ZE_RESULT_SUCCESS && ptr) + return ptr; + } catch (...) {} return sycl::malloc_device(size, q.get_device(), q.get_context()); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 1eead625e76..a70d54a7ab4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -30,6 +30,8 @@ #include #include +#include +#include #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC # include #endif @@ -346,6 +348,10 @@ catch (sycl::exception const &exc) { std::exit(1); } +// Forward declarations for Level Zero allocation helpers (defined after this struct) +static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); +static void ggml_sycl_free_device(void *ptr, sycl::queue &q); + // sycl buffer struct ggml_backend_sycl_buffer_context { @@ -366,7 +372,7 @@ struct ggml_backend_sycl_buffer_context { ~ggml_backend_sycl_buffer_context() { if (dev_ptr != nullptr) { ggml_sycl_set_device(device); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); + ggml_sycl_free_device(dev_ptr, *stream); } //release extra used by tensors @@ -499,8 +505,50 @@ catch (sycl::exception const &exc) { std::exit(1); } +// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering +// DMA-buf/TTM system RAM staging in the xe kernel driver. +// sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation +// via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. +// zeMemAllocDevice uses the SVM/P2P path with no host staging. +static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { + void *ptr = nullptr; + try { + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; + ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); + if (r == ZE_RESULT_SUCCESS && ptr) { + return ptr; + } + } catch (...) {} + return sycl::malloc_device(size, q); +} + +static void ggml_sycl_free_device(void *ptr, sycl::queue &q) { + if (!ptr) return; + try { + auto ze_ctx = sycl::get_native(q.get_context()); + if (zeMemFree(ze_ctx, ptr) == ZE_RESULT_SUCCESS) return; + } catch (...) {} + sycl::free(ptr, q); +} + static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { + try { + auto ze_ctx = sycl::get_native(q_dst.get_context()); + auto ze_dev = sycl::get_native(q_dst.get_device()); + ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, + 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t cl; + ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl); + if (r == ZE_RESULT_SUCCESS) { + zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); + zeCommandListDestroy(cl); + return; + } + } catch (...) {} + // Fallback to host-staged copy char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); q_dst.memcpy((char *)ptr_dst, host_buf, size).wait(); @@ -669,9 +717,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, const queue_ptr stream = buft_ctx->stream; size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 - void * dev_ptr; - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( - size, *stream))); + void * dev_ptr = ggml_sycl_malloc_device(size, *stream); if (!dev_ptr) { GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); return nullptr; @@ -912,18 +958,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } - // FIXME: do not crash if SYCL Buffer alloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ggml_sycl_set_device(i); const queue_ptr stream = ctx->streams[i]; - char * buf; - /* - DPCT1009:208: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( - size, *stream))); + char * buf = (char *)ggml_sycl_malloc_device(size, *stream); if (!buf) { char err_buf[1024]; snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size); @@ -1284,7 +1321,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { ggml_sycl_buffer & b = buffer_pool[i]; if (b.ptr != nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); + ggml_sycl_free_device(b.ptr, *qptr); pool_size -= b.size; } } @@ -1332,9 +1369,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { void * ptr; size_t look_ahead_size = (size_t) (1.05 * size); - SYCL_CHECK( - CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( - look_ahead_size, *qptr))); + ptr = ggml_sycl_malloc_device(look_ahead_size, *qptr); if (!ptr) { GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size); return nullptr; @@ -1362,7 +1397,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } } GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); + ggml_sycl_free_device(ptr, *qptr); pool_size -= size; } }; From 2a0278b8078861d36feba5a491806387fdb03d04 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Wed, 8 Apr 2026 04:41:52 -0400 Subject: [PATCH 02/12] SYCL: address review feedback - remove try/catch, check device types, deduplicate - Remove try/catch from malloc/free/memcpy helpers, check backend and device type upfront instead (ggml_sycl_is_level_zero, ggml_sycl_is_dgpu) - Move shared helpers (is_level_zero, is_dgpu, free_device) to common.cpp and declare in common.hpp to eliminate code duplication - Use SYCL_CHECK(CHECK_TRY_ERROR()) for fallback sycl::free calls - Guard dev2dev_memcpy L0 path to dGPU-to-dGPU only, preserving the host-staged path for iGPU-to-dGPU transfers - Add Windows Level Zero SDK path detection (LEVEL_ZERO_V1_SDK_PATH) in CMakeLists.txt (co-authored with @arthw) --- ggml/src/ggml-sycl/CMakeLists.txt | 12 +++++++++++- ggml/src/ggml-sycl/common.cpp | 30 ++++++++++++++++++------------ ggml/src/ggml-sycl/common.hpp | 3 +++ ggml/src/ggml-sycl/ggml-sycl.cpp | 27 ++++++++++----------------- 4 files changed, 42 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index c4dec5d7653..98f42a6525f 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -39,6 +39,16 @@ if (WIN32) set(CMAKE_CXX_COMPILER "icx") set(CMAKE_CXX_COMPILER_ID "IntelLLVM") endif() + # Level Zero SDK path for Windows + if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH}) + set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH}) + if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}") + target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_V1_SDK_PATH}/include") + set(LEVEL_ZERO_V1_SDK_LIB_PATH "${LEVEL_ZERO_V1_SDK_PATH}/lib") + else() + message(WARNING "LEVEL_ZERO_V1_SDK_PATH set but folder not found: ${LEVEL_ZERO_V1_SDK_PATH}") + endif() + endif() endif() macro(detect_and_find_package package_name) @@ -96,7 +106,7 @@ target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") # Link against Level Zero loader for direct device memory allocation. # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging # in the xe kernel driver during multi-GPU inference. -find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ENV LD_LIBRARY_PATH) +find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH) if(ZE_LOADER_LIB) target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}") diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 7d5caa75623..1b3eb24c18b 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -68,6 +68,23 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } +bool ggml_sycl_is_level_zero(sycl::queue &q) { + return q.get_backend() == sycl::backend::ext_oneapi_level_zero; +} + +bool ggml_sycl_is_dgpu(sycl::queue &q) { + return !q.get_device().get_info(); +} + +void ggml_sycl_free_device(void *ptr, sycl::queue &q) { + if (!ptr) return; + if (ggml_sycl_is_level_zero(q)) { + auto ze_ctx = sycl::get_native(q.get_context()); + if (zeMemFree(ze_ctx, ptr) == ZE_RESULT_SUCCESS) return; + } + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, q))); +} + void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { for (int i = 0; i < ggml_sycl_info().device_count; ++i) { for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { @@ -77,18 +94,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector str } if (extra->data_device[i] != nullptr && streams.size()>0) { ggml_sycl_set_device(i); - bool freed = false; - try { - auto ze_ctx = sycl::get_native( - streams[i]->get_context()); - if (zeMemFree(ze_ctx, extra->data_device[i]) == ZE_RESULT_SUCCESS) { - freed = true; - } - } catch (...) {} - if (!freed) { - SYCL_CHECK( - CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); - } + ggml_sycl_free_device(extra->data_device[i], *(streams[i])); } } delete extra; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 5abf2290651..5a32c95ee61 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -309,6 +309,9 @@ struct ggml_tensor_extra_gpu { optimize_feature optimized_feature; }; +bool ggml_sycl_is_level_zero(sycl::queue &q); +bool ggml_sycl_is_dgpu(sycl::queue &q); +void ggml_sycl_free_device(void *ptr, sycl::queue &q); void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); namespace sycl_ex = sycl::ext::oneapi::experimental; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index a70d54a7ab4..e4f340ed47f 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -348,9 +348,9 @@ catch (sycl::exception const &exc) { std::exit(1); } -// Forward declarations for Level Zero allocation helpers (defined after this struct) +// Forward declaration for Level Zero allocation helper (defined below) static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); -static void ggml_sycl_free_device(void *ptr, sycl::queue &q); +// ggml_sycl_free_device and ggml_sycl_is_level_zero/dgpu are in common.hpp/common.cpp // sycl buffer @@ -511,8 +511,8 @@ catch (sycl::exception const &exc) { // via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. // zeMemAllocDevice uses the SVM/P2P path with no host staging. static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { - void *ptr = nullptr; - try { + if (ggml_sycl_is_level_zero(q) && ggml_sycl_is_dgpu(q)) { + void *ptr = nullptr; auto ze_ctx = sycl::get_native(q.get_context()); auto ze_dev = sycl::get_native(q.get_device()); ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; @@ -520,22 +520,15 @@ static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { if (r == ZE_RESULT_SUCCESS && ptr) { return ptr; } - } catch (...) {} + } return sycl::malloc_device(size, q); } -static void ggml_sycl_free_device(void *ptr, sycl::queue &q) { - if (!ptr) return; - try { - auto ze_ctx = sycl::get_native(q.get_context()); - if (zeMemFree(ze_ctx, ptr) == ZE_RESULT_SUCCESS) return; - } catch (...) {} - sycl::free(ptr, q); -} - static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { - try { + // Use Level Zero direct copy for dGPU-to-dGPU transfers. + // The legacy host-staged path supports iGPU-to-dGPU copies. + if (ggml_sycl_is_level_zero(q_dst) && ggml_sycl_is_dgpu(q_dst) && ggml_sycl_is_dgpu(q_src)) { auto ze_ctx = sycl::get_native(q_dst.get_context()); auto ze_dev = sycl::get_native(q_dst.get_device()); ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, @@ -547,8 +540,8 @@ static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst zeCommandListDestroy(cl); return; } - } catch (...) {} - // Fallback to host-staged copy + } + // Fallback: host-staged copy (supports iGPU, non-L0 backends) char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); q_dst.memcpy((char *)ptr_dst, host_buf, size).wait(); From 70cf28646c387ba2232c53206f893813561132c8 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Wed, 8 Apr 2026 08:37:26 -0400 Subject: [PATCH 03/12] SYCL: add build/runtime flags for Level Zero, address review feedback Implements the architecture suggested by @arthw: compile-time and runtime flags to cleanly separate Level Zero and SYCL memory API paths. - Add GGML_SYCL_SUPPORT_LEVEL_ZERO cmake option (default ON). All Level Zero code is wrapped in #ifdef so the build works on systems without the Level Zero SDK installed (e.g. CPU-only CI servers). Both the loader library and headers are checked before enabling. - Add GGML_SYCL_ENABLE_LEVEL_ZERO runtime env var (default 1). Controls whether Level Zero or SYCL memory APIs are used. Only one API style is used per session, no mixing. If Level Zero is enabled but the devices don't support the Level Zero backend, it auto-disables with a warning. - Remove Level Zero code from dpct_malloc. It was unused (dpct::device_memory is not called anywhere in the backend) and used try/catch for flow control. - Update SYCL.md with documentation for both new parameters. Tested on Intel Arc Pro B70 (32GB), single-GPU and dual-GPU, with both GGML_SYCL_SUPPORT_LEVEL_ZERO=ON and OFF builds. AI-assisted development (Claude). Code reviewed and tested on my hardware. --- docs/backend/SYCL.md | 2 + ggml/CMakeLists.txt | 1 + ggml/src/ggml-sycl/CMakeLists.txt | 42 ++++++++++------- ggml/src/ggml-sycl/common.cpp | 21 +++++---- ggml/src/ggml-sycl/common.hpp | 5 +- ggml/src/ggml-sycl/dpct/helper.hpp | 10 ---- ggml/src/ggml-sycl/ggml-sycl.cpp | 74 +++++++++++++++++++++++++++--- 7 files changed, 109 insertions(+), 46 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 7ebb4ec0297..9a5941b753d 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. | | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. | +| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Intel GPU driver (Level Zero runtime) installed. Reduces system RAM usage during multi-GPU inference. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -733,6 +734,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.| | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) | | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. | +| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. | | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.| diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index f7b6f1f334f..a49eeb4141f 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -249,6 +249,7 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON) +option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero for device memory in SYCL" ON) option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 98f42a6525f..8eb5fd1df13 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -39,14 +39,16 @@ if (WIN32) set(CMAKE_CXX_COMPILER "icx") set(CMAKE_CXX_COMPILER_ID "IntelLLVM") endif() - # Level Zero SDK path for Windows - if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH}) - set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH}) - if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}") - target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_V1_SDK_PATH}/include") - set(LEVEL_ZERO_V1_SDK_LIB_PATH "${LEVEL_ZERO_V1_SDK_PATH}/lib") - else() - message(WARNING "LEVEL_ZERO_V1_SDK_PATH set but folder not found: ${LEVEL_ZERO_V1_SDK_PATH}") + # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled) + if(GGML_SYCL_SUPPORT_LEVEL_ZERO) + if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH}) + set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH}) + if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}") + target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_V1_SDK_PATH}/include") + set(LEVEL_ZERO_V1_SDK_LIB_PATH "${LEVEL_ZERO_V1_SDK_PATH}/lib") + else() + message(WARNING "LEVEL_ZERO_V1_SDK_PATH set but folder not found: ${LEVEL_ZERO_V1_SDK_PATH}") + endif() endif() endif() endif() @@ -103,15 +105,21 @@ endif() target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") -# Link against Level Zero loader for direct device memory allocation. -# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging -# in the xe kernel driver during multi-GPU inference. -find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH) -if(ZE_LOADER_LIB) - target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) - message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}") -else() - message(WARNING "Level Zero loader (ze_loader) not found, multi-GPU may use excessive system RAM") +if (GGML_SYCL_SUPPORT_LEVEL_ZERO) + message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO enabled") + # Link against Level Zero loader for direct device memory allocation. + # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging + # in the xe kernel driver during multi-GPU inference. + find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h HINTS ${ONEAPI_ROOT}/include ${LEVEL_ZERO_V1_SDK_PATH}/include) + find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH) + if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR) + target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO) + message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}") + message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}") + else() + message(WARNING "Level Zero loader or headers not found, Level Zero support disabled") + endif() endif() # Link against oneDNN diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 1b3eb24c18b..d0d4c3c12cc 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -11,8 +11,10 @@ // #include "common.hpp" +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO #include #include +#endif #include "ggml-backend-impl.h" #include "ggml-impl.h" @@ -68,22 +70,17 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } -bool ggml_sycl_is_level_zero(sycl::queue &q) { - return q.get_backend() == sycl::backend::ext_oneapi_level_zero; -} - -bool ggml_sycl_is_dgpu(sycl::queue &q) { - return !q.get_device().get_info(); -} - +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; - if (ggml_sycl_is_level_zero(q)) { + if (g_ggml_sycl_enable_level_zero) { auto ze_ctx = sycl::get_native(q.get_context()); - if (zeMemFree(ze_ctx, ptr) == ZE_RESULT_SUCCESS) return; + zeMemFree(ze_ctx, ptr); + return; } SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, q))); } +#endif void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { for (int i = 0; i < ggml_sycl_info().device_count; ++i) { @@ -94,7 +91,11 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector str } if (extra->data_device[i] != nullptr && streams.size()>0) { ggml_sycl_set_device(i); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO ggml_sycl_free_device(extra->data_device[i], *(streams[i])); +#else + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); +#endif } } delete extra; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 5a32c95ee61..b918adf2d23 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -309,9 +309,10 @@ struct ggml_tensor_extra_gpu { optimize_feature optimized_feature; }; -bool ggml_sycl_is_level_zero(sycl::queue &q); -bool ggml_sycl_is_dgpu(sycl::queue &q); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +extern int g_ggml_sycl_enable_level_zero; void ggml_sycl_free_device(void *ptr, sycl::queue &q); +#endif void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); namespace sycl_ex = sycl::ext::oneapi::experimental; diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 055cb309735..791d3cac52e 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -15,8 +15,6 @@ #include #include -#include -#include #include #include @@ -1309,14 +1307,6 @@ namespace dpct static inline void *dpct_malloc(size_t size, sycl::queue &q) { - try { - auto ze_ctx = sycl::get_native(q.get_context()); - auto ze_dev = sycl::get_native(q.get_device()); - ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; - void *ptr = nullptr; - if (zeMemAllocDevice(ze_ctx, &desc, size, 64, ze_dev, &ptr) == ZE_RESULT_SUCCESS && ptr) - return ptr; - } catch (...) {} return sycl::malloc_device(size, q.get_device(), q.get_context()); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index e4f340ed47f..2b1dfb3b0c5 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -30,8 +30,10 @@ #include #include +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO #include #include +#endif #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC # include #endif @@ -65,6 +67,9 @@ int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +int g_ggml_sycl_enable_level_zero = 0; +#endif int g_ggml_sycl_enable_flash_attention = 1; @@ -220,6 +225,20 @@ static void ggml_check_sycl() try { g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1); + if (g_ggml_sycl_enable_level_zero) { + // Verify all devices use the Level Zero backend before enabling L0 APIs + for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) { + auto & q = dpct::dev_mgr::instance().get_device(i).default_queue(); + if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) { + GGML_LOG_WARN("SYCL device %d does not use Level Zero backend, disabling Level Zero memory API\n", i); + g_ggml_sycl_enable_level_zero = 0; + break; + } + } + } +#endif #ifdef SYCL_FLASH_ATTN g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); @@ -250,6 +269,11 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n"); #endif +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) + GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n"); +#else + GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n"); +#endif GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); @@ -259,6 +283,11 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n"); #endif +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero); +#else + GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n"); +#endif #if GGML_SYCL_DNNL GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn); #else @@ -348,9 +377,10 @@ catch (sycl::exception const &exc) { std::exit(1); } +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO // Forward declaration for Level Zero allocation helper (defined below) static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); -// ggml_sycl_free_device and ggml_sycl_is_level_zero/dgpu are in common.hpp/common.cpp +#endif // sycl buffer @@ -372,7 +402,11 @@ struct ggml_backend_sycl_buffer_context { ~ggml_backend_sycl_buffer_context() { if (dev_ptr != nullptr) { ggml_sycl_set_device(device); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO ggml_sycl_free_device(dev_ptr, *stream); +#else + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); +#endif } //release extra used by tensors @@ -505,13 +539,14 @@ catch (sycl::exception const &exc) { std::exit(1); } +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO // Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering // DMA-buf/TTM system RAM staging in the xe kernel driver. // sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation // via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. // zeMemAllocDevice uses the SVM/P2P path with no host staging. static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { - if (ggml_sycl_is_level_zero(q) && ggml_sycl_is_dgpu(q)) { + if (g_ggml_sycl_enable_level_zero) { void *ptr = nullptr; auto ze_ctx = sycl::get_native(q.get_context()); auto ze_dev = sycl::get_native(q.get_device()); @@ -520,15 +555,17 @@ static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { if (r == ZE_RESULT_SUCCESS && ptr) { return ptr; } + return nullptr; } return sycl::malloc_device(size, q); } +#endif static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO // Use Level Zero direct copy for dGPU-to-dGPU transfers. - // The legacy host-staged path supports iGPU-to-dGPU copies. - if (ggml_sycl_is_level_zero(q_dst) && ggml_sycl_is_dgpu(q_dst) && ggml_sycl_is_dgpu(q_src)) { + if (g_ggml_sycl_enable_level_zero) { auto ze_ctx = sycl::get_native(q_dst.get_context()); auto ze_dev = sycl::get_native(q_dst.get_device()); ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, @@ -541,7 +578,8 @@ static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst return; } } - // Fallback: host-staged copy (supports iGPU, non-L0 backends) +#endif + // Host-staged copy char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); q_dst.memcpy((char *)ptr_dst, host_buf, size).wait(); @@ -710,7 +748,12 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, const queue_ptr stream = buft_ctx->stream; size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 - void * dev_ptr = ggml_sycl_malloc_device(size, *stream); + void * dev_ptr; +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + dev_ptr = ggml_sycl_malloc_device(size, *stream); +#else + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(size, *stream))); +#endif if (!dev_ptr) { GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); return nullptr; @@ -953,7 +996,12 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_sycl_set_device(i); const queue_ptr stream = ctx->streams[i]; - char * buf = (char *)ggml_sycl_malloc_device(size, *stream); + char * buf; +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + buf = (char *)ggml_sycl_malloc_device(size, *stream); +#else + SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(size, *stream))); +#endif if (!buf) { char err_buf[1024]; snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size); @@ -1314,7 +1362,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { ggml_sycl_buffer & b = buffer_pool[i]; if (b.ptr != nullptr) { +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO ggml_sycl_free_device(b.ptr, *qptr); +#else + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); +#endif pool_size -= b.size; } } @@ -1362,7 +1414,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { void * ptr; size_t look_ahead_size = (size_t) (1.05 * size); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO ptr = ggml_sycl_malloc_device(look_ahead_size, *qptr); +#else + SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(look_ahead_size, *qptr))); +#endif if (!ptr) { GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size); return nullptr; @@ -1390,7 +1446,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } } GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO ggml_sycl_free_device(ptr, *qptr); +#else + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); +#endif pool_size -= size; } }; From 89c7bae994ee3c36885c5c9bbe2deef4e3efa1e7 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Thu, 9 Apr 2026 05:28:55 -0400 Subject: [PATCH 04/12] SYCL: unify Level Zero malloc/free call sites, address review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move ggml_sycl_malloc_device to common.cpp alongside ggml_sycl_free_device. Both functions are now unconditionally available — Level Zero code is #ifdef'd inside the functions, not at call sites. All call sites use uniform SYCL_CHECK(CHECK_TRY_ERROR()) wrapping with no #ifdef blocks. Addresses arthw's review: wrap all malloc/free in SYCL_CHECK for stack traces on failure, eliminate duplicated #ifdef/else patterns at 6 call sites (-29 lines net). Co-Authored-By: Claude Opus 4.6 (1M context) --- ggml/src/ggml-sycl/common.cpp | 32 ++++++++++++---- ggml/src/ggml-sycl/common.hpp | 6 ++- ggml/src/ggml-sycl/ggml-sycl.cpp | 63 +++----------------------------- 3 files changed, 36 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index d0d4c3c12cc..980acb1f6ce 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -70,17 +70,39 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } +// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering +// DMA-buf/TTM system RAM staging in the xe kernel driver. +// sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation +// via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. +// zeMemAllocDevice uses the SVM/P2P path with no host staging. +void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + if (g_ggml_sycl_enable_level_zero) { + void *ptr = nullptr; + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; + ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); + if (r == ZE_RESULT_SUCCESS && ptr) { + return ptr; + } + return nullptr; + } +#endif + return sycl::malloc_device(size, q); +} + void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO if (g_ggml_sycl_enable_level_zero) { auto ze_ctx = sycl::get_native(q.get_context()); zeMemFree(ze_ctx, ptr); return; } - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, q))); -} #endif + sycl::free(ptr, q); +} void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { for (int i = 0; i < ggml_sycl_info().device_count; ++i) { @@ -91,11 +113,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector str } if (extra->data_device[i] != nullptr && streams.size()>0) { ggml_sycl_set_device(i); -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - ggml_sycl_free_device(extra->data_device[i], *(streams[i])); -#else - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i])))); } } delete extra; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index b918adf2d23..8de88244c06 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -311,8 +311,12 @@ struct ggml_tensor_extra_gpu { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO extern int g_ggml_sycl_enable_level_zero; -void ggml_sycl_free_device(void *ptr, sycl::queue &q); #endif + +// Call Level Zero or SYCL allocation API +void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); +void ggml_sycl_free_device(void *ptr, sycl::queue &q); + void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); namespace sycl_ex = sycl::ext::oneapi::experimental; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 2b1dfb3b0c5..bd617053985 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -377,11 +377,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO -// Forward declaration for Level Zero allocation helper (defined below) -static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); -#endif - // sycl buffer struct ggml_backend_sycl_buffer_context { @@ -402,11 +397,7 @@ struct ggml_backend_sycl_buffer_context { ~ggml_backend_sycl_buffer_context() { if (dev_ptr != nullptr) { ggml_sycl_set_device(device); -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - ggml_sycl_free_device(dev_ptr, *stream); -#else - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); } //release extra used by tensors @@ -539,28 +530,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO -// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering -// DMA-buf/TTM system RAM staging in the xe kernel driver. -// sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation -// via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. -// zeMemAllocDevice uses the SVM/P2P path with no host staging. -static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { - if (g_ggml_sycl_enable_level_zero) { - void *ptr = nullptr; - auto ze_ctx = sycl::get_native(q.get_context()); - auto ze_dev = sycl::get_native(q.get_device()); - ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; - ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); - if (r == ZE_RESULT_SUCCESS && ptr) { - return ptr; - } - return nullptr; - } - return sycl::malloc_device(size, q); -} -#endif - static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO @@ -749,11 +718,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 void * dev_ptr; -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - dev_ptr = ggml_sycl_malloc_device(size, *stream); -#else - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(size, *stream))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream))); if (!dev_ptr) { GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); return nullptr; @@ -997,11 +962,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_sycl_set_device(i); const queue_ptr stream = ctx->streams[i]; char * buf; -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - buf = (char *)ggml_sycl_malloc_device(size, *stream); -#else - SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(size, *stream))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream))); if (!buf) { char err_buf[1024]; snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size); @@ -1362,11 +1323,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { ggml_sycl_buffer & b = buffer_pool[i]; if (b.ptr != nullptr) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - ggml_sycl_free_device(b.ptr, *qptr); -#else - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr))); pool_size -= b.size; } } @@ -1414,11 +1371,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { void * ptr; size_t look_ahead_size = (size_t) (1.05 * size); -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - ptr = ggml_sycl_malloc_device(look_ahead_size, *qptr); -#else - SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(look_ahead_size, *qptr))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr))); if (!ptr) { GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size); return nullptr; @@ -1446,11 +1399,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } } GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - ggml_sycl_free_device(ptr, *qptr); -#else - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); -#endif + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr))); pool_size -= size; } }; From 279cbbf663e60680eaedf28be8eac6ee44040330 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Mon, 13 Apr 2026 08:58:08 -0400 Subject: [PATCH 05/12] SYCL: add Level Zero SDK to CI, fix device check and missed alloc paths Add Level Zero SDK installation to Ubuntu and Windows SYCL CI jobs so the Level Zero code path is compiled and tested in CI. Fix two bugs found during extended dual-GPU testing (no ONEAPI_DEVICE_SELECTOR set): - The Level Zero backend check was iterating all SYCL devices including CPU. The OpenCL CPU device caused Level Zero to be disabled for the GPUs, defeating the fix on multi-GPU systems. Added is_gpu() filter so only GPU devices are checked. - sycl_ext_malloc_device/sycl_ext_free (tensor reorder temp buffers) were still calling sycl::malloc/sycl::free directly, bypassing the Level Zero path. Routed through ggml_sycl_malloc_device/free_device for consistency with the other device memory call sites. Co-Authored-By: Claude Opus 4.6 (1M context) --- .devops/intel.Dockerfile | 3 +-- .github/workflows/build-sycl.yml | 15 +++++++++++++ .github/workflows/release.yml | 18 ++++++++++++++++ docs/backend/SYCL.md | 2 +- ggml/src/ggml-sycl/common.cpp | 19 ++++++++++++++-- ggml/src/ggml-sycl/ggml-sycl.cpp | 37 ++++++++++++++++++++++++++------ 6 files changed, 82 insertions(+), 12 deletions(-) diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 8e830d46251..e56ab301042 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ - apt-get install -y git libssl-dev + apt-get install -y git libssl-dev libze-dev WORKDIR /app @@ -109,4 +109,3 @@ WORKDIR /app HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] ENTRYPOINT [ "/app/llama-server" ] - diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml index 2a6642292e6..b6015c3eba2 100644 --- a/.github/workflows/build-sycl.yml +++ b/.github/workflows/build-sycl.yml @@ -71,6 +71,12 @@ jobs: wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept + - name: Install Level Zero SDK + shell: bash + run: | + sudo apt-get update + sudo apt-get install -y libze-dev + - name: Clone id: checkout uses: actions/checkout@v6 @@ -127,6 +133,15 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + - name: Install Level Zero SDK + shell: pwsh + run: | + $release = Invoke-RestMethod -Uri "https://api.github.com/repos/oneapi-src/level-zero/releases/latest" + $asset = $release.assets | Where-Object { $_.name -like "level-zero-win-sdk*.zip" } | Select-Object -First 1 + Invoke-WebRequest -Uri $asset.browser_download_url -OutFile "level-zero-win-sdk.zip" + Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force + "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 924f6cd3fe3..1f2be55275c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -621,6 +621,15 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + - name: Install Level Zero SDK + shell: pwsh + run: | + $release = Invoke-RestMethod -Uri "https://api.github.com/repos/oneapi-src/level-zero/releases/latest" + $asset = $release.assets | Where-Object { $_.name -like "level-zero-win-sdk*.zip" } | Select-Object -First 1 + Invoke-WebRequest -Uri $asset.browser_download_url -OutFile "level-zero-win-sdk.zip" + Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force + "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: @@ -655,6 +664,9 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + ZE_LOADER_DLL=$(find "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll | head -n 1) + test -n "$ZE_LOADER_DLL" + cp "$ZE_LOADER_DLL" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin @@ -718,6 +730,12 @@ jobs: wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept + - name: Install Level Zero SDK + shell: bash + run: | + sudo apt-get update + sudo apt-get install -y libze-dev + - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 9a5941b753d..078c17a5fb1 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -720,7 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. | | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. | -| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Intel GPU driver (Level Zero runtime) installed. Reduces system RAM usage during multi-GPU inference. | +| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 980acb1f6ce..0c9a239c350 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -16,6 +16,9 @@ #include #endif +#include +#include + #include "ggml-backend-impl.h" #include "ggml-impl.h" @@ -75,9 +78,21 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block // sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation // via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. // zeMemAllocDevice uses the SVM/P2P path with no host staging. +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +static bool ggml_sycl_queue_supports_level_zero(sycl::queue &q) { + const char * env = std::getenv("GGML_SYCL_ENABLE_LEVEL_ZERO"); + unsigned int enabled = 1; + if (env && sscanf(env, " %u", &enabled) != 1) { + enabled = 1; + } + + return enabled && q.get_device().is_gpu() && q.get_backend() == sycl::backend::ext_oneapi_level_zero; +} +#endif + void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - if (g_ggml_sycl_enable_level_zero) { + if (ggml_sycl_queue_supports_level_zero(q)) { void *ptr = nullptr; auto ze_ctx = sycl::get_native(q.get_context()); auto ze_dev = sycl::get_native(q.get_device()); @@ -95,7 +110,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - if (g_ggml_sycl_enable_level_zero) { + if (ggml_sycl_queue_supports_level_zero(q)) { auto ze_ctx = sycl::get_native(q.get_context()); zeMemFree(ze_ctx, ptr); return; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index bd617053985..51574d564bb 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -228,11 +228,16 @@ static void ggml_check_sycl() try { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1); if (g_ggml_sycl_enable_level_zero) { - // Verify all devices use the Level Zero backend before enabling L0 APIs + // Verify all GPU devices use the Level Zero backend before enabling L0 APIs + // Only check GPU devices; CPU devices use OpenCL backend and would + // incorrectly disable Level Zero for the GPUs that need it. for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) { auto & q = dpct::dev_mgr::instance().get_device(i).default_queue(); + if (!q.get_device().is_gpu()) { + continue; + } if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) { - GGML_LOG_WARN("SYCL device %d does not use Level Zero backend, disabling Level Zero memory API\n", i); + GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i); g_ggml_sycl_enable_level_zero = 0; break; } @@ -530,11 +535,27 @@ catch (sycl::exception const &exc) { std::exit(1); } +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) { + if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) { + return false; + } + + ze_device_handle_t ze_dev = sycl::get_native(q.get_device()); + ze_device_properties_t props = {}; + props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + ze_result_t r = zeDeviceGetProperties(ze_dev, &props); + return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); +} +#endif + static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO // Use Level Zero direct copy for dGPU-to-dGPU transfers. - if (g_ggml_sycl_enable_level_zero) { + const bool l0_copy_supported = + ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src); + if (g_ggml_sycl_enable_level_zero && l0_copy_supported) { auto ze_ctx = sycl::get_native(q_dst.get_context()); auto ze_dev = sycl::get_native(q_dst.get_device()); ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, @@ -542,9 +563,11 @@ static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst ze_command_list_handle_t cl; ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl); if (r == ZE_RESULT_SUCCESS) { - zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); + r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); zeCommandListDestroy(cl); - return; + if (r == ZE_RESULT_SUCCESS) { + return; + } } } #endif @@ -3394,7 +3417,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) // If async allocation extension is not available, use_async should always be false. GGML_ASSERT(!use_async); #endif - return sycl::malloc(size, *stream, sycl::usm::alloc::device); + return ggml_sycl_malloc_device(size, *stream); } static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) { @@ -3408,7 +3431,7 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) { // If async allocation extension is not available, use_async should always be false. GGML_ASSERT(!use_async); #endif - sycl::free(ptr, *stream); + ggml_sycl_free_device(ptr, *stream); } // RAII wrapper for temporary reorder buffers with optional host memory fallback. From b13c39e8ba414b4527c728a3aa7f813fc63bbd76 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Thu, 7 May 2026 09:38:38 -0400 Subject: [PATCH 06/12] SYCL: address arthw review feedback on Level Zero memory API structure - Move ggml_sycl_malloc_device to static function in ggml-sycl.cpp; only ggml_sycl_free_device (used by common.cpp) stays in common.cpp - Switch both helpers to use g_ggml_sycl_enable_level_zero global instead of per-call queue backend checks - Remove #ifdef wrapper from global definition; always declare at 0, add #else branch in init block so it stays 0 when L0 not compiled in - Update init loop comment to explain GPU-only device check - CMakeLists: message(STATUS) before the if block; align option wording AI-assisted implementation. Reviewed and tested on dual Intel Arc Pro B70 (32 GB each): test-backend-ops OK on both GPUs, single/dual-GPU Q4_K_M and Q8_0 bench correct, zeMemAllocDevice GTT delta confirmed <5 MiB per 4 GiB allocation (vs ~4 GiB shadow with sycl::malloc_device). Co-Authored-By: Claude Sonnet 4.6 --- ggml/CMakeLists.txt | 2 +- ggml/src/ggml-sycl/CMakeLists.txt | 2 +- ggml/src/ggml-sycl/common.cpp | 38 ++----------------------------- ggml/src/ggml-sycl/common.hpp | 5 ---- ggml/src/ggml-sycl/ggml-sycl.cpp | 32 +++++++++++++++++++++----- 5 files changed, 30 insertions(+), 49 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index a49eeb4141f..b929460a8cc 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -249,7 +249,7 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON) -option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero for device memory in SYCL" ON) +option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON) option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 8eb5fd1df13..6285d714267 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -105,8 +105,8 @@ endif() target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") +message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}") if (GGML_SYCL_SUPPORT_LEVEL_ZERO) - message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO enabled") # Link against Level Zero loader for direct device memory allocation. # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging # in the xe kernel driver during multi-GPU inference. diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 0c9a239c350..4fb48aba25d 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -73,50 +73,16 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } -// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering -// DMA-buf/TTM system RAM staging in the xe kernel driver. -// sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation -// via xe_gem_prime_export, consuming system RAM equal to VRAM allocated. -// zeMemAllocDevice uses the SVM/P2P path with no host staging. -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO -static bool ggml_sycl_queue_supports_level_zero(sycl::queue &q) { - const char * env = std::getenv("GGML_SYCL_ENABLE_LEVEL_ZERO"); - unsigned int enabled = 1; - if (env && sscanf(env, " %u", &enabled) != 1) { - enabled = 1; - } - - return enabled && q.get_device().is_gpu() && q.get_backend() == sycl::backend::ext_oneapi_level_zero; -} -#endif - -void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - if (ggml_sycl_queue_supports_level_zero(q)) { - void *ptr = nullptr; - auto ze_ctx = sycl::get_native(q.get_context()); - auto ze_dev = sycl::get_native(q.get_device()); - ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; - ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); - if (r == ZE_RESULT_SUCCESS && ptr) { - return ptr; - } - return nullptr; - } -#endif - return sycl::malloc_device(size, q); -} - void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - if (ggml_sycl_queue_supports_level_zero(q)) { + if (g_ggml_sycl_enable_level_zero) { auto ze_ctx = sycl::get_native(q.get_context()); zeMemFree(ze_ctx, ptr); return; } #endif - sycl::free(ptr, q); + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, q))); } void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 8de88244c06..697b99b3228 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -309,12 +309,7 @@ struct ggml_tensor_extra_gpu { optimize_feature optimized_feature; }; -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO extern int g_ggml_sycl_enable_level_zero; -#endif - -// Call Level Zero or SYCL allocation API -void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); void ggml_sycl_free_device(void *ptr, sycl::queue &q); void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 51574d564bb..ff78873b332 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -67,9 +67,7 @@ int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO int g_ggml_sycl_enable_level_zero = 0; -#endif int g_ggml_sycl_enable_flash_attention = 1; @@ -227,10 +225,13 @@ static void ggml_check_sycl() try { g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1); +#else + g_ggml_sycl_enable_level_zero = 0; +#endif if (g_ggml_sycl_enable_level_zero) { - // Verify all GPU devices use the Level Zero backend before enabling L0 APIs - // Only check GPU devices; CPU devices use OpenCL backend and would - // incorrectly disable Level Zero for the GPUs that need it. + // Verify all GPU devices use the Level Zero backend before enabling L0 APIs. + // Only check GPU devices; CPU devices use OpenCL and would otherwise + // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set. for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) { auto & q = dpct::dev_mgr::instance().get_device(i).default_queue(); if (!q.get_device().is_gpu()) { @@ -243,7 +244,6 @@ static void ggml_check_sycl() try { } } } -#endif #ifdef SYCL_FLASH_ATTN g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); @@ -549,6 +549,26 @@ static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) { } #endif +// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering +// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference. +// zeMemAllocDevice uses the SVM/P2P path with no host staging. +static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + if (g_ggml_sycl_enable_level_zero) { + void *ptr = nullptr; + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; + ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); + if (r == ZE_RESULT_SUCCESS && ptr) { + return ptr; + } + return nullptr; + } +#endif + return sycl::malloc_device(size, q); +} + static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO From c16d3f50ba70ed21c8009875ffe3c4c9acfebdd0 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Thu, 7 May 2026 11:35:12 -0400 Subject: [PATCH 07/12] SYCL: remove unused cstdio/cstdlib includes from common.cpp Leftover from the deleted ggml_sycl_queue_supports_level_zero helper. Co-authored-by: Claude Sonnet 4.6 --- ggml/src/ggml-sycl/common.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 4fb48aba25d..f497772dfcc 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -16,9 +16,6 @@ #include #endif -#include -#include - #include "ggml-backend-impl.h" #include "ggml-impl.h" From b230d160e0a1da8d30fa42011890628b6436d653 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Fri, 8 May 2026 14:02:41 +0800 Subject: [PATCH 08/12] Apply suggestions from code review Co-authored-by: Neo Zhang --- ggml/src/ggml-sycl/common.cpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index f497772dfcc..705368b812c 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -11,8 +11,8 @@ // #include "common.hpp" -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO #include +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO #include #endif diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ff78873b332..48f89447fd9 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -30,8 +30,8 @@ #include #include -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO #include +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO #include #endif #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC From 85cac5005193cef2f0bf34037a192297569de5ae Mon Sep 17 00:00:00 2001 From: PMZFX Date: Fri, 8 May 2026 04:42:14 -0400 Subject: [PATCH 09/12] SYCL: preserve Level Zero allocation path during early malloc --- .devops/intel.Dockerfile | 9 ++++- .github/workflows/build-sycl.yml | 13 ++++--- .github/workflows/release.yml | 13 ++++--- docs/backend/SYCL.md | 4 +-- ggml/src/ggml-sycl/common.cpp | 59 +++++++++++++++++++++++++++++++- ggml/src/ggml-sycl/common.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 20 ----------- 7 files changed, 85 insertions(+), 34 deletions(-) diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index e56ab301042..2f76337fb8a 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -5,8 +5,15 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF +ARG LEVEL_ZERO_VERSION=1.28.2 +ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04 RUN apt-get update && \ - apt-get install -y git libssl-dev libze-dev + apt-get install -y git libssl-dev wget ca-certificates && \ + cd /tmp && \ + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \ + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \ + apt-get install -y ./level-zero.deb ./level-zero-devel.deb && \ + rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb WORKDIR /app diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml index b6015c3eba2..09635f64edb 100644 --- a/.github/workflows/build-sycl.yml +++ b/.github/workflows/build-sycl.yml @@ -50,6 +50,8 @@ jobs: env: ONEAPI_ROOT: /opt/intel/oneapi/ ONEAPI_INSTALLER_VERSION: "2025.3.3" + LEVEL_ZERO_VERSION: "1.28.2" + LEVEL_ZERO_UBUNTU_VERSION: "u24.04" continue-on-error: true @@ -74,8 +76,10 @@ jobs: - name: Install Level Zero SDK shell: bash run: | - sudo apt-get update - sudo apt-get install -y libze-dev + cd /tmp + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb + sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb - name: Clone id: checkout @@ -113,6 +117,7 @@ jobs: env: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" ONEAPI_INSTALLER_VERSION: "2025.3.3" steps: @@ -136,9 +141,7 @@ jobs: - name: Install Level Zero SDK shell: pwsh run: | - $release = Invoke-RestMethod -Uri "https://api.github.com/repos/oneapi-src/level-zero/releases/latest" - $asset = $release.assets | Where-Object { $_.name -like "level-zero-win-sdk*.zip" } | Select-Object -First 1 - Invoke-WebRequest -Uri $asset.browser_download_url -OutFile "level-zero-win-sdk.zip" + Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1f2be55275c..e98ffdf4a2c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -600,6 +600,7 @@ jobs: env: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" ONEAPI_INSTALLER_VERSION: "2025.3.3" @@ -624,9 +625,7 @@ jobs: - name: Install Level Zero SDK shell: pwsh run: | - $release = Invoke-RestMethod -Uri "https://api.github.com/repos/oneapi-src/level-zero/releases/latest" - $asset = $release.assets | Where-Object { $_.name -like "level-zero-win-sdk*.zip" } | Select-Object -First 1 - Invoke-WebRequest -Uri $asset.browser_download_url -OutFile "level-zero-win-sdk.zip" + Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append @@ -707,6 +706,8 @@ jobs: env: ONEAPI_ROOT: /opt/intel/oneapi/ ONEAPI_INSTALLER_VERSION: "2025.3.3" + LEVEL_ZERO_VERSION: "1.28.2" + LEVEL_ZERO_UBUNTU_VERSION: "u24.04" steps: - name: Clone @@ -733,8 +734,10 @@ jobs: - name: Install Level Zero SDK shell: bash run: | - sudo apt-get update - sudo apt-get install -y libze-dev + cd /tmp + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb + sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb - name: ccache uses: ggml-org/ccache-action@v1.2.21 diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 078c17a5fb1..105110943ec 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -737,7 +737,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. | | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | -| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.| +| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. | ## Design Rule @@ -813,7 +813,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 - `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device` - You need to enable to support 4GB memory malloc by: + With the default `GGML_SYCL_ENABLE_LEVEL_ZERO=1`, llama.cpp requests Level Zero's relaxed maximum-size allocation limit directly. If Level Zero support is disabled at build time or runtime and the allocation goes through SYCL/Unified Runtime instead, enable support for allocations larger than 4 GiB by: ``` export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 705368b812c..ae08abad81b 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -59,6 +59,20 @@ bool gpu_has_xmx(sycl::device &dev) { return dev.has(sycl::aspect::ext_intel_matrix); } +static int ggml_sycl_get_env(const char *env_name, int default_val) { + char *user_device_string = getenv(env_name); + int user_number = default_val; + + unsigned n; + if (user_device_string != NULL && + sscanf(user_device_string, " %u", &n) == 1) { + user_number = (int)n; + } else { + user_number = default_val; + } + return user_number; +} + int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) { const int64_t max_range = std::numeric_limits::max(); int64_t sycl_down_blk_size = block_size; @@ -70,10 +84,53 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) { + return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) && + q.get_device().is_gpu() && + q.get_backend() == sycl::backend::ext_oneapi_level_zero; +} +#endif + +// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering +// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference. +// The decision is made from the queue and runtime env because large buffers can be +// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero. +void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + if (ggml_sycl_use_level_zero_device_alloc(q)) { + void *ptr = nullptr; + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); +#ifdef ZE_RELAXED_ALLOCATION_LIMITS_EXP_NAME + ze_relaxed_allocation_limits_exp_desc_t relaxed_desc = { + ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, + nullptr, + ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE, + }; + ze_device_mem_alloc_desc_t alloc_desc = { + ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + &relaxed_desc, + 0, + 0, + }; +#else + ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; +#endif + ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); + if (r == ZE_RESULT_SUCCESS && ptr) { + return ptr; + } + return nullptr; + } +#endif + return sycl::malloc_device(size, q); +} + void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - if (g_ggml_sycl_enable_level_zero) { + if (ggml_sycl_use_level_zero_device_alloc(q)) { auto ze_ctx = sycl::get_native(q.get_context()); zeMemFree(ze_ctx, ptr); return; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 697b99b3228..298e873d01d 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -310,6 +310,7 @@ struct ggml_tensor_extra_gpu { }; extern int g_ggml_sycl_enable_level_zero; +void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); void ggml_sycl_free_device(void *ptr, sycl::queue &q); void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 48f89447fd9..338dbec2913 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -549,26 +549,6 @@ static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) { } #endif -// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering -// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference. -// zeMemAllocDevice uses the SVM/P2P path with no host staging. -static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - if (g_ggml_sycl_enable_level_zero) { - void *ptr = nullptr; - auto ze_ctx = sycl::get_native(q.get_context()); - auto ze_dev = sycl::get_native(q.get_device()); - ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0}; - ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); - if (r == ZE_RESULT_SUCCESS && ptr) { - return ptr; - } - return nullptr; - } -#endif - return sycl::malloc_device(size, q); -} - static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO From daf701ae9e66e546b84ee486ec2b4d1e33f8ef88 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Fri, 8 May 2026 06:57:21 -0400 Subject: [PATCH 10/12] ci: fix Level Zero package conflict in Intel Docker build --- .devops/intel.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 2f76337fb8a..9fce42341d1 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -12,7 +12,7 @@ RUN apt-get update && \ cd /tmp && \ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \ wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \ - apt-get install -y ./level-zero.deb ./level-zero-devel.deb && \ + apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \ rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb WORKDIR /app From 021f1a566e8584faf9340c3f866e810bc6b214a8 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Fri, 8 May 2026 07:34:37 -0400 Subject: [PATCH 11/12] ci: find Level Zero loader in oneAPI package step --- .github/workflows/release.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e98ffdf4a2c..1036e72ef8d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -663,8 +663,9 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - ZE_LOADER_DLL=$(find "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll | head -n 1) + ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true) test -n "$ZE_LOADER_DLL" + echo "Using Level Zero loader: $ZE_LOADER_DLL" cp "$ZE_LOADER_DLL" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin From 8bead87e960bee9fa51d07b36fd01975122b79cd Mon Sep 17 00:00:00 2001 From: PMZFX Date: Fri, 8 May 2026 07:57:42 -0400 Subject: [PATCH 12/12] ci: allow Windows SYCL package without Level Zero DLL --- .github/workflows/release.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1036e72ef8d..00e37c3e638 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -664,9 +664,12 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true) - test -n "$ZE_LOADER_DLL" - echo "Using Level Zero loader: $ZE_LOADER_DLL" - cp "$ZE_LOADER_DLL" ./build/bin + if [ -n "$ZE_LOADER_DLL" ]; then + echo "Using Level Zero loader: $ZE_LOADER_DLL" + cp "$ZE_LOADER_DLL" ./build/bin + else + echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime" + fi cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin