diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index b9f7deb150d..aa961acfda0 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -248,6 +248,7 @@ option(GGML_RPC                             "ggml: use RPC"
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"        ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 8e589fa238d..31b0ecd22ef 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -85,6 +85,12 @@ detect_and_find_package(IntelSYCL)
 if (IntelSYCL_FOUND)
     # Use oneAPI CMake when possible
     target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
+    # Add compatibility shim for work_group_static.hpp if not present in this oneAPI version
+    get_target_property(_sycl_inc IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES)
+    if(NOT EXISTS "${_sycl_inc}/sycl/ext/oneapi/work_group_static.hpp")
+        message(STATUS "work_group_static.hpp not found in oneAPI, using compatibility shim")
+        target_include_directories(ggml-sycl BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/sycl_compat")
+    endif()
 else()
     # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
     target_compile_options(ggml-sycl PRIVATE "-fsycl")
@@ -93,6 +99,23 @@ endif()
 
 target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
 
+# Level Zero: direct device memory allocation bypasses xe/TTM staging overhead
+message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
+if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
+    find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h
+              HINTS ${ONEAPI_ROOT}/include /usr/include)
+    find_library(ZE_LOADER_LIB ze_loader
+                 HINTS ${ONEAPI_ROOT}/lib /usr/lib /usr/lib64 ENV LD_LIBRARY_PATH)
+    if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
+        target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
+        message(STATUS "Level Zero loader: ${ZE_LOADER_LIB}")
+        message(STATUS "Level Zero headers: ${LEVEL_ZERO_INCLUDE_DIR}")
+    else()
+        message(WARNING "Level Zero not found — L0 memory path disabled (loader=${ZE_LOADER_LIB}, inc=${LEVEL_ZERO_INCLUDE_DIR})")
+    endif()
+endif()
+
 # Link against oneDNN
 set(GGML_SYCL_DNNL 0)
 if(GGML_SYCL_DNN)
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index 05fd5ef46c7..892b098937d 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -12,6 +12,11 @@
 
 #include "common.hpp"
 
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#include <sycl/backend.hpp>
+#include <level_zero/ze_api.h>
+#endif
+
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 
@@ -66,6 +71,52 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
   return sycl_down_blk_size;
 }
 
+// ggml_sycl_malloc_device: allocates device memory via Level Zero (if available)
+// to avoid the xe driver's TTM staging path triggered by sycl::malloc_device.
+// sycl::malloc_device creates a 1:1 host mirror of every VRAM allocation via
+// xe_gem_prime_export; zeMemAllocDevice uses SVM/P2P path with ~8 MiB overhead.
+void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+    if (g_ggml_sycl_enable_level_zero) {
+        auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
+        auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
+
+        ze_relaxed_allocation_limits_exp_desc_t relaxed = {
+            ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC,
+            nullptr,
+            ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE
+        };
+        ze_device_mem_alloc_desc_t alloc_desc = {
+            ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+            &relaxed,
+            0, 0
+        };
+        void *ptr = nullptr;
+        ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr);
+        if (r == ZE_RESULT_SUCCESS && ptr) {
+            return ptr;
+        }
+        GGML_LOG_WARN("zeMemAllocDevice failed (0x%x), falling back to sycl::malloc_device\n", r);
+    }
+#endif
+    return sycl::malloc_device(size, q);
+}
+
+void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
+    if (!ptr) return;
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+    if (g_ggml_sycl_enable_level_zero) {
+        sycl::usm::alloc type = sycl::get_pointer_type(ptr, q.get_context());
+        if (type == sycl::usm::alloc::unknown) {
+            auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
+            zeMemFree(ze_ctx, ptr);
+            return;
+        }
+    }
+#endif
+    sycl::free(ptr, q);
+}
+
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
@@ -75,8 +126,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> str
         }
         if (extra->data_device[i] != nullptr && streams.size()>0) {
             ggml_sycl_set_device(i);
-            SYCL_CHECK(
-                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+            SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i]))));
         }
     }
     delete extra;
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 5abf2290651..455524c5486 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -309,6 +309,15 @@ struct ggml_tensor_extra_gpu {
   optimize_feature optimized_feature;
 };
 
+// Level Zero direct device memory helpers.
+// ggml_sycl_malloc_device uses zeMemAllocDevice (bypasses TTM staging in xe driver).
+// ggml_sycl_free_device matches the allocator used by ggml_sycl_malloc_device.
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+extern int g_ggml_sycl_enable_level_zero;
+#endif
+void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
+void   ggml_sycl_free_device(void *ptr, sycl::queue &q);
+
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
 
 namespace sycl_ex = sycl::ext::oneapi::experimental;
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 791d3cac52e..f403ee8c0af 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -684,13 +684,13 @@ namespace dpct
             "use total memory as free memory";
 #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
         if (!has(sycl::aspect::ext_intel_free_memory)) {
-          std::cerr << warning_info << std::endl;
           free_memory = total_memory;
         } else {
           free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
         }
 #else
-        std::cerr << warning_info << std::endl;
+        static bool warned = false;
+        if (!warned) { std::cerr << warning_info << std::endl; warned = true; }
         free_memory = total_memory;
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma message("Querying the number of bytes of free memory is not supported")
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 1eead625e76..e5e43b6727a 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -30,11 +30,30 @@
 #include <regex>
 
 #include <sycl/sycl.hpp>
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#  include <sycl/backend.hpp>
+#  include <level_zero/ze_api.h>
+#  ifdef __linux__
+#    include <fcntl.h>
+#    include <sys/ioctl.h>
+#    include <sys/mman.h>
+#    include <linux/udmabuf.h>
+#    include <unistd.h>
+// memfd_create is in sys/mman.h on glibc ≥ 2.27; define MFD_ALLOW_SEALING if missing
+#    ifndef MFD_ALLOW_SEALING
+#      define MFD_ALLOW_SEALING 2U
+#    endif
+#  endif
+#endif
 #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
 #    include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
 #endif
 #include <sycl/half_type.hpp>
 
+#if defined(__x86_64__) || defined(_M_X64)
+#  include <immintrin.h>
+#endif
+
 #include "ggml.h"
 #include "ggml-sycl.h"
 #include "ggml-impl.h"
@@ -64,6 +83,10 @@ int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
 int g_ggml_sycl_use_async_mem_op = 0;
 int g_ggml_sycl_enable_flash_attention = 1;
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+int g_ggml_sycl_enable_level_zero = 1; // initialized fully in ggml_check_sycl
+int g_ggml_sycl_enable_zero_copy = 0;  // initialized fully in ggml_check_sycl
+#endif
 
 
 static ggml_sycl_device_info ggml_sycl_init() {
@@ -114,6 +137,15 @@ static ggml_sycl_device_info ggml_sycl_init() {
     return info;
 }
 
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__)
+static void ggml_sycl_system_barrier(sycl::queue & q) {
+    // Level Zero System Barrier ensures visibility between CPU and GPU on UMA
+    auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
+    auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
+    zeContextSystemBarrier(ze_ctx, ze_dev);
+}
+#endif
+
 const ggml_sycl_device_info & ggml_sycl_info() {
     static ggml_sycl_device_info info = ggml_sycl_init();
     return info;
@@ -218,6 +250,21 @@ static void ggml_check_sycl() try {
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
         g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
+        g_ggml_sycl_enable_zero_copy = get_sycl_env("GGML_SYCL_ENABLE_ZERO_COPY", 0);
+        if (g_ggml_sycl_enable_level_zero) {
+            // Only enable if all devices actually use the Level Zero backend
+            for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) {
+                auto &q = dpct::dev_mgr::instance().get_device(i).default_queue();
+                if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+                    GGML_LOG_WARN("SYCL device %d is not Level Zero — disabling L0 memory path\n", i);
+                    g_ggml_sycl_enable_level_zero = 0;
+                    break;
+                }
+            }
+        }
+#endif
 
 #ifdef SYCL_FLASH_ATTN
         g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
@@ -263,6 +310,9 @@ static void ggml_check_sycl() try {
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
 #endif
         GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+        GGML_LOG_INFO("  GGML_SYCL_ENABLE_ZERO_COPY: %d\n", g_ggml_sycl_enable_zero_copy);
+#endif
 
 #ifdef SYCL_FLASH_ATTN
         GGML_LOG_INFO("  GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
@@ -356,6 +406,14 @@ struct ggml_backend_sycl_buffer_context {
     optimize_feature opt_feature;
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 
+    // Zero-copy DMA-BUF fields (Lunar Lake UMA only).
+    // Physical pages are shared: CPU writes via memfd_mmap_ptr, GPU reads via dev_ptr.
+    bool  is_zero_copy   = false;
+    int   memfd          = -1;   // memfd backing the DMA-BUF
+    int   dma_fd         = -1;   // udmabuf dma_buf file descriptor
+    void *memfd_mmap_ptr = nullptr; // CPU mmap of memfd for direct writes in set_tensor
+    size_t dma_size      = 0;
+
     ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
         device(device), dev_ptr(dev_ptr), stream(stream) {
             check_allow_gpu_index(device);
@@ -366,14 +424,27 @@ struct ggml_backend_sycl_buffer_context {
     ~ggml_backend_sycl_buffer_context() {
         if (dev_ptr != nullptr) {
             ggml_sycl_set_device(device);
-            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__)
+            if (is_zero_copy) {
+                // Free the L0 device mapping of the DMA-BUF pages
+                auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(stream->get_context());
+                zeMemFree(ze_ctx, dev_ptr);
+                // Release the CPU mmap — physical pages still held by dma_fd
+                if (memfd_mmap_ptr) munmap(memfd_mmap_ptr, dma_size);
+                if (dma_fd  >= 0) close(dma_fd);
+                if (memfd   >= 0) close(memfd);
+            } else {
+                SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
+            }
+#else
+            SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
+#endif
         }
 
         //release extra used by tensors
         for (ggml_tensor_extra_gpu * extra : tensor_extras) {
             release_extra_gpu(extra);
         }
-
     }
 };
 
@@ -457,12 +528,37 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
     GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__)
+    if (ctx->is_zero_copy) {
+        // Zero-copy path: CPU and GPU share the same physical pages via DMA-BUF.
+        size_t tensor_buf_offset = (size_t)((char *)tensor->data - (char *)ctx->dev_ptr);
+        void *dst_ptr = (char *)ctx->memfd_mmap_ptr + tensor_buf_offset + offset;
+        memcpy(dst_ptr, data, size);
+
+        // Ensure iGPU sees CPU writes (essential for UMA coherence)
+        ggml_sycl_system_barrier(*(ctx->stream));
+        return;
+    }
+#endif
+
     ggml_sycl_set_device(ctx->device);
     auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
+
+    bool is_integrated = stream->get_device().get_info<sycl::info::device::host_unified_memory>();
+    if (is_integrated) {
+        // Shared memory (malloc_shared) path for integrated GPUs.
+        memcpy((char *)tensor->data + offset, data, size);
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__)
+        ggml_sycl_system_barrier(*stream);
+#endif
+        return;
+    }
+
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
 #ifndef _WIN32
-    // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
-    // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
+    // Note: Use host buffer to save the data from mmap(), then copy to device.
+    // Workaround for mmap() issue on PVC GPU.
     char * host_buf = (char *) malloc(size);
     memcpy(host_buf, data, size);
     SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
@@ -661,6 +757,176 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t
     return ctx->name.c_str();
 }
 
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__)
+// ---------------------------------------------------------------------------
+// Zero-copy helpers for Lunar Lake UMA
+//
+// Strategy:
+//   1. memfd_create — anonymous RAM-backed file (no disk I/O)
+//   2. ftruncate    — reserve the size without touching pages yet
+//   3. mmap         — CPU-accessible view for set_tensor writes
+//   4. F_SEAL_SHRINK/GROW — required by udmabuf; does NOT block CPU writes
+//   5. udmabuf ioctl — kernel pins the pages as a dma_buf (dma_fd)
+//   6. zeMemAllocDevice via ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF
+//      — maps the same pages into the GPU's page tables (no TTM staging)
+//
+// Result: CPU writes via memfd_mmap_ptr, GPU reads via the returned USM ptr.
+// ---------------------------------------------------------------------------
+
+// Try to raise /sys/module/udmabuf/parameters/size_limit_mb to cover `size`.
+// Returns the current limit in bytes (after the attempted raise).
+static size_t udmabuf_ensure_size_limit(size_t size) {
+    const char *path = "/sys/module/udmabuf/parameters/size_limit_mb";
+    size_t limit_bytes = (size_t)64 << 20; // conservative default: 64 MiB
+
+    FILE *f = fopen(path, "r");
+    if (f) {
+        unsigned long cur_mb = 0;
+        if (fscanf(f, "%lu", &cur_mb) == 1) limit_bytes = cur_mb << 20;
+        fclose(f);
+    }
+
+    if (size <= limit_bytes) return limit_bytes; // already sufficient
+
+    // Try to raise the limit (requires root / CAP_SYS_ADMIN)
+    unsigned long need_mb = (unsigned long)((size + ((1ul << 20) - 1)) >> 20);
+    f = fopen(path, "w");
+    if (f) {
+        fprintf(f, "%lu", need_mb);
+        fclose(f);
+        // Re-read to confirm
+        f = fopen(path, "r");
+        if (f) {
+            unsigned long new_mb = 0;
+            if (fscanf(f, "%lu", &new_mb) == 1) limit_bytes = new_mb << 20;
+            fclose(f);
+        }
+    }
+
+    if (size > limit_bytes) {
+        GGML_LOG_WARN(
+            "[ZC] udmabuf size_limit_mb=%lu MiB < required %lu MiB.\n"
+            "[ZC] To enable zero-copy for buffers > 64 MiB, run:\n"
+            "[ZC]   echo 8192 | sudo tee /sys/module/udmabuf/parameters/size_limit_mb\n"
+            "[ZC] For persistence add to /etc/tmpfiles.d/udmabuf.conf:\n"
+            "[ZC]   w /sys/module/udmabuf/parameters/size_limit_mb - - - - 8192\n",
+            (unsigned long)(limit_bytes >> 20), (unsigned long)(size >> 20));
+    }
+    return limit_bytes;
+}
+
+// Step 1-5: turn a size into a pinned (memfd, dma_fd, mmap_ptr) triple.
+// Returns dma_fd >= 0 on success; caller owns all three on success.
+static int ggml_sycl_create_udmabuf(size_t size, int *out_memfd, void **out_mmap_ptr) {
+    *out_memfd    = -1;
+    *out_mmap_ptr = nullptr;
+
+    // Check (and try to raise) the kernel's udmabuf size limit
+    size_t limit = udmabuf_ensure_size_limit(size);
+    if (size > limit) {
+        // Can't raise — caller will fall through to sycl::malloc_shared
+        return -1;
+    }
+
+    // Create anonymous RAM-backed file
+    int mfd = memfd_create("gguf_weights_zc", MFD_ALLOW_SEALING);
+    if (mfd < 0) {
+        GGML_LOG_WARN("[ZC] memfd_create failed: %s\n", strerror(errno));
+        return -1;
+    }
+
+    if (ftruncate(mfd, (off_t)size) < 0) {
+        GGML_LOG_WARN("[ZC] ftruncate failed: %s\n", strerror(errno));
+        close(mfd);
+        return -1;
+    }
+
+    // Map for CPU writes during set_tensor; pages are demand-paged
+    void *mptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
+    if (mptr == MAP_FAILED) {
+        GGML_LOG_WARN("[ZC] mmap memfd failed: %s\n", strerror(errno));
+        close(mfd);
+        return -1;
+    }
+
+    // Seal: prevent size changes (required by udmabuf); F_SEAL_WRITE intentionally
+    // NOT set — we still need to write through the mmap above.
+    if (fcntl(mfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0) {
+        GGML_LOG_WARN("[ZC] F_ADD_SEALS failed: %s\n", strerror(errno));
+        munmap(mptr, size);
+        close(mfd);
+        return -1;
+    }
+
+    int ufd = open("/dev/udmabuf", O_RDWR);
+    if (ufd < 0) {
+        GGML_LOG_WARN("[ZC] /dev/udmabuf open failed: %s\n", strerror(errno));
+        munmap(mptr, size);
+        close(mfd);
+        return -1;
+    }
+
+    struct udmabuf_create req = {};
+    req.memfd  = (uint32_t)mfd;
+    req.flags  = UDMABUF_FLAGS_CLOEXEC;
+    req.offset = 0;
+    req.size   = size;
+
+    int dma_fd = ioctl(ufd, UDMABUF_CREATE, &req);
+    close(ufd);
+
+    if (dma_fd < 0) {
+        GGML_LOG_WARN("[ZC] UDMABUF_CREATE failed for %zu MiB: %s\n", size >> 20, strerror(errno));
+        munmap(mptr, size);
+        close(mfd);
+        return -1;
+    }
+
+    GGML_LOG_INFO("[ZC] zero-copy udmabuf created: %zu MiB, dma_fd=%d\n", size >> 20, dma_fd);
+    *out_memfd    = mfd;
+    *out_mmap_ptr = mptr;
+    return dma_fd;
+}
+
+// Step 6: import a dma_buf FD into Level Zero as a device USM pointer.
+// The physical pages are already pinned by the kernel (via udmabuf).
+// On Lunar Lake UMA there is no real device/host distinction — this maps
+// the same LPDDR5X pages into the GPU's MMU without any TTM staging.
+static void *ggml_sycl_import_dma_buf(sycl::queue &q, int dma_fd, size_t size) {
+    auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
+    auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
+
+    ze_external_memory_import_fd_t import_fd = {
+        ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD,
+        nullptr,
+        ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF,
+        dma_fd
+    };
+
+    // Allow allocations >4 GiB (necessary for large models)
+    ze_relaxed_allocation_limits_exp_desc_t relaxed = {
+        ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC,
+        &import_fd,
+        ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE
+    };
+
+    ze_device_mem_alloc_desc_t alloc_desc = {
+        ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+        &relaxed,
+        0, 0
+    };
+
+    void *ptr = nullptr;
+    ze_result_t res = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 4096, ze_dev, &ptr);
+    if (res != ZE_RESULT_SUCCESS || !ptr) {
+        GGML_LOG_WARN("[ZC] zeMemAllocDevice DMA_BUF import failed: 0x%x\n", (unsigned)res);
+        return nullptr;
+    }
+    GGML_LOG_DEBUG("[ZC] DMA-BUF imported as USM ptr %p (size %zu)\n", ptr, size);
+    return ptr;
+}
+#endif // GGML_SYCL_SUPPORT_LEVEL_ZERO && __linux__
+
 static ggml_backend_buffer_t
 ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) try {
@@ -669,14 +935,51 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     const queue_ptr stream = buft_ctx->stream;
     size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
 
+    bool is_integrated = stream->get_device().get_info<sycl::info::device::host_unified_memory>();
+
+#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) && defined(__linux__)
+    // Zero-copy path for Lunar Lake UMA: import memory via DMA-BUF instead of
+    // allocating a separate device buffer. This eliminates the TTM staging mirror
+    // AND the double memcpy in set_tensor (CPU host_buf + sycl::memcpy).
+    if (is_integrated && g_ggml_sycl_enable_zero_copy) {
+        int out_memfd = -1;
+        void *mmap_ptr = nullptr;
+        int dma_fd = ggml_sycl_create_udmabuf(size, &out_memfd, &mmap_ptr);
+        if (dma_fd >= 0) {
+            void *dev_ptr = ggml_sycl_import_dma_buf(*stream, dma_fd, size);
+            if (dev_ptr) {
+                GGML_LOG_INFO("[ZC] zero-copy buffer %zu MiB via DMA-BUF\n", size >> 20);
+                ggml_backend_sycl_buffer_context *ctx =
+                    new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
+                ctx->is_zero_copy   = true;
+                ctx->memfd          = out_memfd;
+                ctx->dma_fd         = dma_fd;
+                ctx->memfd_mmap_ptr = mmap_ptr;
+                ctx->dma_size       = size;
+                return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
+            }
+            // Import failed — fall through to standard allocation
+            munmap(mmap_ptr, size);
+            close(dma_fd);
+            close(out_memfd);
+            GGML_LOG_WARN("[ZC] DMA-BUF import failed, falling back to sycl::malloc_shared\n");
+        }
+    }
+#endif
+
     void * dev_ptr;
-    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
-                                    size, *stream)));
+    if (is_integrated) {
+        // For iGPU without zero-copy: sycl::malloc_shared avoids discrete-GPU memcpy overhead
+        SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc(size, *stream, sycl::usm::alloc::shared)));
+    } else {
+        // For dGPU: zeMemAllocDevice bypasses TTM staging (PR #21597 approach)
+        SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
+    }
     if (!dev_ptr) {
-      GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+      GGML_LOG_ERROR("%s: can't allocate %zu Bytes of memory on device\n", __func__, size);
       return nullptr;
     }
-    ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
+    ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }
 catch (sycl::exception const &exc) {
@@ -922,8 +1225,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
-        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
-                                        size, *stream)));
+        bool is_integrated = stream->get_device().get_info<sycl::info::device::host_unified_memory>();
+        auto alloc_kind = is_integrated ? sycl::usm::alloc::shared : sycl::usm::alloc::device;
+        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc(size, *stream, alloc_kind)));
         if (!buf) {
             char err_buf[1024];
             snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8126249e143..e2cc1144e29 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3493,6 +3493,141 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const auto & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s   %s    %s   %s   %s   %s    %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s   %s    %s = %s + %s + %s    %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data              mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t          buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb   = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model   += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i].dev == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model   += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t          dev = devices[i].dev;
+        llama_memory_breakdown_data mb  = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const double unaccounted = (double)total - (double)self - (double)free;
+
+        table_data.push_back({
+            template_gpu,
+            "  - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string((int64_t)unaccounted / (int64_t)MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            "  - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t          buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb   = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            "  - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
 //
 // training
 //