diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index b9f7deb150d..aa961acfda0 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -248,6 +248,7 @@ option(GGML_RPC "ggml: use RPC" option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) +option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON) option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON) option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 8e589fa238d..31b0ecd22ef 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -85,6 +85,12 @@ detect_and_find_package(IntelSYCL) if (IntelSYCL_FOUND) # Use oneAPI CMake when possible target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX) + # Add compatibility shim for work_group_static.hpp if not present in this oneAPI version + get_target_property(_sycl_inc IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES) + if(NOT EXISTS "${_sycl_inc}/sycl/ext/oneapi/work_group_static.hpp") + message(STATUS "work_group_static.hpp not found in oneAPI, using compatibility shim") + target_include_directories(ggml-sycl BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/sycl_compat") + endif() else() # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance target_compile_options(ggml-sycl PRIVATE "-fsycl") @@ -93,6 +99,23 @@ endif() target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") +# Level Zero: direct device memory allocation bypasses xe/TTM staging overhead +message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}") +if (GGML_SYCL_SUPPORT_LEVEL_ZERO) + find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h + HINTS ${ONEAPI_ROOT}/include /usr/include) + find_library(ZE_LOADER_LIB ze_loader + HINTS ${ONEAPI_ROOT}/lib /usr/lib /usr/lib64 ENV LD_LIBRARY_PATH) + if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR) + target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO) + message(STATUS "Level Zero loader: ${ZE_LOADER_LIB}") + message(STATUS "Level Zero headers: ${LEVEL_ZERO_INCLUDE_DIR}") + else() + message(WARNING "Level Zero not found — L0 memory path disabled (loader=${ZE_LOADER_LIB}, inc=${LEVEL_ZERO_INCLUDE_DIR})") + endif() +endif() + # Link against oneDNN set(GGML_SYCL_DNNL 0) if(GGML_SYCL_DNN) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 05fd5ef46c7..892b098937d 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -12,6 +12,11 @@ #include "common.hpp" +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#include +#include +#endif + #include "ggml-backend-impl.h" #include "ggml-impl.h" @@ -66,6 +71,52 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } +// ggml_sycl_malloc_device: allocates device memory via Level Zero (if available) +// to avoid the xe driver's TTM staging path triggered by sycl::malloc_device. +// sycl::malloc_device creates a 1:1 host mirror of every VRAM allocation via +// xe_gem_prime_export; zeMemAllocDevice uses SVM/P2P path with ~8 MiB overhead. +void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + if (g_ggml_sycl_enable_level_zero) { + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + + ze_relaxed_allocation_limits_exp_desc_t relaxed = { + ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, + nullptr, + ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE + }; + ze_device_mem_alloc_desc_t alloc_desc = { + ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + &relaxed, + 0, 0 + }; + void *ptr = nullptr; + ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr); + if (r == ZE_RESULT_SUCCESS && ptr) { + return ptr; + } + GGML_LOG_WARN("zeMemAllocDevice failed (0x%x), falling back to sycl::malloc_device\n", r); + } +#endif + return sycl::malloc_device(size, q); +} + +void ggml_sycl_free_device(void *ptr, sycl::queue &q) { + if (!ptr) return; +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + if (g_ggml_sycl_enable_level_zero) { + sycl::usm::alloc type = sycl::get_pointer_type(ptr, q.get_context()); + if (type == sycl::usm::alloc::unknown) { + auto ze_ctx = sycl::get_native(q.get_context()); + zeMemFree(ze_ctx, ptr); + return; + } + } +#endif + sycl::free(ptr, q); +} + void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { for (int i = 0; i < ggml_sycl_info().device_count; ++i) { for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { @@ -75,8 +126,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector str } if (extra->data_device[i] != nullptr && streams.size()>0) { ggml_sycl_set_device(i); - SYCL_CHECK( - CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i])))); } } delete extra; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 5abf2290651..455524c5486 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -309,6 +309,15 @@ struct ggml_tensor_extra_gpu { optimize_feature optimized_feature; }; +// Level Zero direct device memory helpers. +// ggml_sycl_malloc_device uses zeMemAllocDevice (bypasses TTM staging in xe driver). +// ggml_sycl_free_device matches the allocator used by ggml_sycl_malloc_device. +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +extern int g_ggml_sycl_enable_level_zero; +#endif +void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); +void ggml_sycl_free_device(void *ptr, sycl::queue &q); + void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); namespace sycl_ex = sycl::ext::oneapi::experimental; diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 791d3cac52e..f403ee8c0af 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -684,13 +684,13 @@ namespace dpct "use total memory as free memory"; #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) if (!has(sycl::aspect::ext_intel_free_memory)) { - std::cerr << warning_info << std::endl; free_memory = total_memory; } else { free_memory = get_info(); } #else - std::cerr << warning_info << std::endl; + static bool warned = false; + if (!warned) { std::cerr << warning_info << std::endl; warned = true; } free_memory = total_memory; #if defined(_MSC_VER) && !defined(__clang__) #pragma message("Querying the number of bytes of free memory is not supported") diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 1eead625e76..e5e43b6727a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -30,11 +30,30 @@ #include #include +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +# include +# include +# ifdef __linux__ +# include +# include +# include +# include +# include +// memfd_create is in sys/mman.h on glibc ≥ 2.27; define MFD_ALLOW_SEALING if missing +# ifndef MFD_ALLOW_SEALING +# define MFD_ALLOW_SEALING 2U +# endif +# endif +#endif #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC # include #endif #include +#if defined(__x86_64__) || defined(_M_X64) +# include +#endif + #include "ggml.h" #include "ggml-sycl.h" #include "ggml-impl.h" @@ -64,6 +83,10 @@ int g_ggml_sycl_disable_dnn = 0; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; int g_ggml_sycl_enable_flash_attention = 1; +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +int g_ggml_sycl_enable_level_zero = 1; // initialized fully in ggml_check_sycl +int g_ggml_sycl_enable_zero_copy = 0; // initialized fully in ggml_check_sycl +#endif static ggml_sycl_device_info ggml_sycl_init() { @@ -114,6 +137,15 @@ static ggml_sycl_device_info ggml_sycl_init() { return info; } +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__) +static void ggml_sycl_system_barrier(sycl::queue & q) { + // Level Zero System Barrier ensures visibility between CPU and GPU on UMA + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + zeContextSystemBarrier(ze_ctx, ze_dev); +} +#endif + const ggml_sycl_device_info & ggml_sycl_info() { static ggml_sycl_device_info info = ggml_sycl_init(); return info; @@ -218,6 +250,21 @@ static void ggml_check_sycl() try { g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1); + g_ggml_sycl_enable_zero_copy = get_sycl_env("GGML_SYCL_ENABLE_ZERO_COPY", 0); + if (g_ggml_sycl_enable_level_zero) { + // Only enable if all devices actually use the Level Zero backend + for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) { + auto &q = dpct::dev_mgr::instance().get_device(i).default_queue(); + if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) { + GGML_LOG_WARN("SYCL device %d is not Level Zero — disabling L0 memory path\n", i); + g_ggml_sycl_enable_level_zero = 0; + break; + } + } + } +#endif #ifdef SYCL_FLASH_ATTN g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); @@ -263,6 +310,9 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO + GGML_LOG_INFO(" GGML_SYCL_ENABLE_ZERO_COPY: %d\n", g_ggml_sycl_enable_zero_copy); +#endif #ifdef SYCL_FLASH_ATTN GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention); @@ -356,6 +406,14 @@ struct ggml_backend_sycl_buffer_context { optimize_feature opt_feature; std::vector tensor_extras; + // Zero-copy DMA-BUF fields (Lunar Lake UMA only). + // Physical pages are shared: CPU writes via memfd_mmap_ptr, GPU reads via dev_ptr. + bool is_zero_copy = false; + int memfd = -1; // memfd backing the DMA-BUF + int dma_fd = -1; // udmabuf dma_buf file descriptor + void *memfd_mmap_ptr = nullptr; // CPU mmap of memfd for direct writes in set_tensor + size_t dma_size = 0; + ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : device(device), dev_ptr(dev_ptr), stream(stream) { check_allow_gpu_index(device); @@ -366,14 +424,27 @@ struct ggml_backend_sycl_buffer_context { ~ggml_backend_sycl_buffer_context() { if (dev_ptr != nullptr) { ggml_sycl_set_device(device); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__) + if (is_zero_copy) { + // Free the L0 device mapping of the DMA-BUF pages + auto ze_ctx = sycl::get_native(stream->get_context()); + zeMemFree(ze_ctx, dev_ptr); + // Release the CPU mmap — physical pages still held by dma_fd + if (memfd_mmap_ptr) munmap(memfd_mmap_ptr, dma_size); + if (dma_fd >= 0) close(dma_fd); + if (memfd >= 0) close(memfd); + } else { + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); + } +#else + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); +#endif } //release extra used by tensors for (ggml_tensor_extra_gpu * extra : tensor_extras) { release_extra_gpu(extra); } - } }; @@ -457,12 +528,37 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__) + if (ctx->is_zero_copy) { + // Zero-copy path: CPU and GPU share the same physical pages via DMA-BUF. + size_t tensor_buf_offset = (size_t)((char *)tensor->data - (char *)ctx->dev_ptr); + void *dst_ptr = (char *)ctx->memfd_mmap_ptr + tensor_buf_offset + offset; + memcpy(dst_ptr, data, size); + + // Ensure iGPU sees CPU writes (essential for UMA coherence) + ggml_sycl_system_barrier(*(ctx->stream)); + return; + } +#endif + ggml_sycl_set_device(ctx->device); auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); + + bool is_integrated = stream->get_device().get_info(); + if (is_integrated) { + // Shared memory (malloc_shared) path for integrated GPUs. + memcpy((char *)tensor->data + offset, data, size); +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__) + ggml_sycl_system_barrier(*stream); +#endif + return; + } + SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); #ifndef _WIN32 - // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU. - // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here. + // Note: Use host buffer to save the data from mmap(), then copy to device. + // Workaround for mmap() issue on PVC GPU. char * host_buf = (char *) malloc(size); memcpy(host_buf, data, size); SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait())); @@ -661,6 +757,176 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t return ctx->name.c_str(); } +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__) +// --------------------------------------------------------------------------- +// Zero-copy helpers for Lunar Lake UMA +// +// Strategy: +// 1. memfd_create — anonymous RAM-backed file (no disk I/O) +// 2. ftruncate — reserve the size without touching pages yet +// 3. mmap — CPU-accessible view for set_tensor writes +// 4. F_SEAL_SHRINK/GROW — required by udmabuf; does NOT block CPU writes +// 5. udmabuf ioctl — kernel pins the pages as a dma_buf (dma_fd) +// 6. zeMemAllocDevice via ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF +// — maps the same pages into the GPU's page tables (no TTM staging) +// +// Result: CPU writes via memfd_mmap_ptr, GPU reads via the returned USM ptr. +// --------------------------------------------------------------------------- + +// Try to raise /sys/module/udmabuf/parameters/size_limit_mb to cover `size`. +// Returns the current limit in bytes (after the attempted raise). +static size_t udmabuf_ensure_size_limit(size_t size) { + const char *path = "/sys/module/udmabuf/parameters/size_limit_mb"; + size_t limit_bytes = (size_t)64 << 20; // conservative default: 64 MiB + + FILE *f = fopen(path, "r"); + if (f) { + unsigned long cur_mb = 0; + if (fscanf(f, "%lu", &cur_mb) == 1) limit_bytes = cur_mb << 20; + fclose(f); + } + + if (size <= limit_bytes) return limit_bytes; // already sufficient + + // Try to raise the limit (requires root / CAP_SYS_ADMIN) + unsigned long need_mb = (unsigned long)((size + ((1ul << 20) - 1)) >> 20); + f = fopen(path, "w"); + if (f) { + fprintf(f, "%lu", need_mb); + fclose(f); + // Re-read to confirm + f = fopen(path, "r"); + if (f) { + unsigned long new_mb = 0; + if (fscanf(f, "%lu", &new_mb) == 1) limit_bytes = new_mb << 20; + fclose(f); + } + } + + if (size > limit_bytes) { + GGML_LOG_WARN( + "[ZC] udmabuf size_limit_mb=%lu MiB < required %lu MiB.\n" + "[ZC] To enable zero-copy for buffers > 64 MiB, run:\n" + "[ZC] echo 8192 | sudo tee /sys/module/udmabuf/parameters/size_limit_mb\n" + "[ZC] For persistence add to /etc/tmpfiles.d/udmabuf.conf:\n" + "[ZC] w /sys/module/udmabuf/parameters/size_limit_mb - - - - 8192\n", + (unsigned long)(limit_bytes >> 20), (unsigned long)(size >> 20)); + } + return limit_bytes; +} + +// Step 1-5: turn a size into a pinned (memfd, dma_fd, mmap_ptr) triple. +// Returns dma_fd >= 0 on success; caller owns all three on success. +static int ggml_sycl_create_udmabuf(size_t size, int *out_memfd, void **out_mmap_ptr) { + *out_memfd = -1; + *out_mmap_ptr = nullptr; + + // Check (and try to raise) the kernel's udmabuf size limit + size_t limit = udmabuf_ensure_size_limit(size); + if (size > limit) { + // Can't raise — caller will fall through to sycl::malloc_shared + return -1; + } + + // Create anonymous RAM-backed file + int mfd = memfd_create("gguf_weights_zc", MFD_ALLOW_SEALING); + if (mfd < 0) { + GGML_LOG_WARN("[ZC] memfd_create failed: %s\n", strerror(errno)); + return -1; + } + + if (ftruncate(mfd, (off_t)size) < 0) { + GGML_LOG_WARN("[ZC] ftruncate failed: %s\n", strerror(errno)); + close(mfd); + return -1; + } + + // Map for CPU writes during set_tensor; pages are demand-paged + void *mptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); + if (mptr == MAP_FAILED) { + GGML_LOG_WARN("[ZC] mmap memfd failed: %s\n", strerror(errno)); + close(mfd); + return -1; + } + + // Seal: prevent size changes (required by udmabuf); F_SEAL_WRITE intentionally + // NOT set — we still need to write through the mmap above. + if (fcntl(mfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0) { + GGML_LOG_WARN("[ZC] F_ADD_SEALS failed: %s\n", strerror(errno)); + munmap(mptr, size); + close(mfd); + return -1; + } + + int ufd = open("/dev/udmabuf", O_RDWR); + if (ufd < 0) { + GGML_LOG_WARN("[ZC] /dev/udmabuf open failed: %s\n", strerror(errno)); + munmap(mptr, size); + close(mfd); + return -1; + } + + struct udmabuf_create req = {}; + req.memfd = (uint32_t)mfd; + req.flags = UDMABUF_FLAGS_CLOEXEC; + req.offset = 0; + req.size = size; + + int dma_fd = ioctl(ufd, UDMABUF_CREATE, &req); + close(ufd); + + if (dma_fd < 0) { + GGML_LOG_WARN("[ZC] UDMABUF_CREATE failed for %zu MiB: %s\n", size >> 20, strerror(errno)); + munmap(mptr, size); + close(mfd); + return -1; + } + + GGML_LOG_INFO("[ZC] zero-copy udmabuf created: %zu MiB, dma_fd=%d\n", size >> 20, dma_fd); + *out_memfd = mfd; + *out_mmap_ptr = mptr; + return dma_fd; +} + +// Step 6: import a dma_buf FD into Level Zero as a device USM pointer. +// The physical pages are already pinned by the kernel (via udmabuf). +// On Lunar Lake UMA there is no real device/host distinction — this maps +// the same LPDDR5X pages into the GPU's MMU without any TTM staging. +static void *ggml_sycl_import_dma_buf(sycl::queue &q, int dma_fd, size_t size) { + auto ze_ctx = sycl::get_native(q.get_context()); + auto ze_dev = sycl::get_native(q.get_device()); + + ze_external_memory_import_fd_t import_fd = { + ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD, + nullptr, + ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF, + dma_fd + }; + + // Allow allocations >4 GiB (necessary for large models) + ze_relaxed_allocation_limits_exp_desc_t relaxed = { + ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, + &import_fd, + ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE + }; + + ze_device_mem_alloc_desc_t alloc_desc = { + ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + &relaxed, + 0, 0 + }; + + void *ptr = nullptr; + ze_result_t res = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 4096, ze_dev, &ptr); + if (res != ZE_RESULT_SUCCESS || !ptr) { + GGML_LOG_WARN("[ZC] zeMemAllocDevice DMA_BUF import failed: 0x%x\n", (unsigned)res); + return nullptr; + } + GGML_LOG_DEBUG("[ZC] DMA-BUF imported as USM ptr %p (size %zu)\n", ptr, size); + return ptr; +} +#endif // GGML_SYCL_SUPPORT_LEVEL_ZERO && __linux__ + static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { @@ -669,14 +935,51 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, const queue_ptr stream = buft_ctx->stream; size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 + bool is_integrated = stream->get_device().get_info(); + +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__) + // Zero-copy path for Lunar Lake UMA: import memory via DMA-BUF instead of + // allocating a separate device buffer. This eliminates the TTM staging mirror + // AND the double memcpy in set_tensor (CPU host_buf + sycl::memcpy). + if (is_integrated && g_ggml_sycl_enable_zero_copy) { + int out_memfd = -1; + void *mmap_ptr = nullptr; + int dma_fd = ggml_sycl_create_udmabuf(size, &out_memfd, &mmap_ptr); + if (dma_fd >= 0) { + void *dev_ptr = ggml_sycl_import_dma_buf(*stream, dma_fd, size); + if (dev_ptr) { + GGML_LOG_INFO("[ZC] zero-copy buffer %zu MiB via DMA-BUF\n", size >> 20); + ggml_backend_sycl_buffer_context *ctx = + new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); + ctx->is_zero_copy = true; + ctx->memfd = out_memfd; + ctx->dma_fd = dma_fd; + ctx->memfd_mmap_ptr = mmap_ptr; + ctx->dma_size = size; + return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); + } + // Import failed — fall through to standard allocation + munmap(mmap_ptr, size); + close(dma_fd); + close(out_memfd); + GGML_LOG_WARN("[ZC] DMA-BUF import failed, falling back to sycl::malloc_shared\n"); + } + } +#endif + void * dev_ptr; - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( - size, *stream))); + if (is_integrated) { + // For iGPU without zero-copy: sycl::malloc_shared avoids discrete-GPU memcpy overhead + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc(size, *stream, sycl::usm::alloc::shared))); + } else { + // For dGPU: zeMemAllocDevice bypasses TTM staging (PR #21597 approach) + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream))); + } if (!dev_ptr) { - GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); + GGML_LOG_ERROR("%s: can't allocate %zu Bytes of memory on device\n", __func__, size); return nullptr; } - ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); + ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } catch (sycl::exception const &exc) { @@ -922,8 +1225,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ - SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( - size, *stream))); + bool is_integrated = stream->get_device().get_info(); + auto alloc_kind = is_integrated ? sycl::usm::alloc::shared : sycl::usm::alloc::device; + SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc(size, *stream, alloc_kind))); if (!buf) { char err_buf[1024]; snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8126249e143..e2cc1144e29 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3493,6 +3493,141 @@ void llama_perf_context_reset(llama_context * ctx) { ctx->perf_reset(); } +void llama_memory_breakdown_print(const struct llama_context * ctx) { + const auto & devices = ctx->get_model().devices; + + std::map memory_breakdown = ctx->memory_breakdown(); + + std::vector> table_data; + table_data.reserve(devices.size()); + const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n"; + const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n"; + const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n"; + + table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"}); + + constexpr size_t MiB = 1024 * 1024; + const std::vector desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "}; + + // track seen buffer types to avoid double counting: + std::set seen_buffer_types; + + // accumulative memory breakdown for each device and for host: + std::vector mb_dev(devices.size()); + llama_memory_breakdown_data mb_host; + + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + if (ggml_backend_buft_is_host(buft)) { + mb_host.model += mb.model; + mb_host.context += mb.context; + mb_host.compute += mb.compute; + seen_buffer_types.insert(buft); + continue; + } + ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); + if (dev) { + int i_dev = -1; + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i].dev == dev) { + i_dev = i; + break; + } + } + if (i_dev != -1) { + mb_dev[i_dev].model += mb.model; + mb_dev[i_dev].context += mb.context; + mb_dev[i_dev].compute += mb.compute; + seen_buffer_types.insert(buft); + continue; + } + } + } + + // print memory breakdown for each device: + for (size_t i = 0; i < devices.size(); i++) { + ggml_backend_dev_t dev = devices[i].dev; + llama_memory_breakdown_data mb = mb_dev[i]; + + const std::string name = ggml_backend_dev_name(dev); + std::string desc = ggml_backend_dev_description(dev); + for (const std::string & prefix : desc_prefixes_strip) { + if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) { + desc = desc.substr(prefix.length()); + } + } + + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + + const size_t self = mb.model + mb.context + mb.compute; + const double unaccounted = (double)total - (double)self - (double)free; + + table_data.push_back({ + template_gpu, + " - " + name + " (" + desc + ")", + std::to_string(total / MiB), + std::to_string(free / MiB), + std::to_string(self / MiB), + std::to_string(mb.model / MiB), + std::to_string(mb.context / MiB), + std::to_string(mb.compute / MiB), + std::to_string((int64_t)unaccounted / (int64_t)MiB)}); + } + + // print memory breakdown for host: + { + const size_t self = mb_host.model + mb_host.context + mb_host.compute; + table_data.push_back({ + template_other, + " - Host", + "", // total + "", // free + std::to_string(self / MiB), + std::to_string(mb_host.model / MiB), + std::to_string(mb_host.context / MiB), + std::to_string(mb_host.compute / MiB), + ""}); // unaccounted + } + + // print memory breakdown for all remaining buffer types: + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + if (seen_buffer_types.count(buft) == 1) { + continue; + } + const std::string name = ggml_backend_buft_name(buft); + const size_t self = mb.model + mb.context + mb.compute; + table_data.push_back({ + template_other, + " - " + name, + "", // total + "", // free + std::to_string(self / MiB), + std::to_string(mb.model / MiB), + std::to_string(mb.context / MiB), + std::to_string(mb.compute / MiB), + ""}); // unaccounted + seen_buffer_types.insert(buft); + } + + for (size_t j = 1; j < table_data[0].size(); j++) { + size_t max_len = 0; + for (const auto & td : table_data) { + max_len = std::max(max_len, td[j].length()); + } + for (auto & td : table_data) { + td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' '); + } + } + for (const auto & td : table_data) { + LLAMA_LOG_INFO(td[0].c_str(), + __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(), + td[6].c_str(), td[7].c_str(), td[8].c_str()); + } +} // // training //