ggml-org · i-Charlys · Apr 18, 2026 · Apr 18, 2026 · Apr 18, 2026 · Apr 27, 2026
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
@@ -248,6 +248,7 @@ option(GGML_RPC                             "ggml: use RPC"
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"        ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING

@@ -85,6 +85,12 @@ detect_and_find_package(IntelSYCL)
 if (IntelSYCL_FOUND)
     # Use oneAPI CMake when possible
     target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
+    # Add compatibility shim for work_group_static.hpp if not present in this oneAPI version
+    get_target_property(_sycl_inc IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES)
+    if(NOT EXISTS "${_sycl_inc}/sycl/ext/oneapi/work_group_static.hpp")
+        message(STATUS "work_group_static.hpp not found in oneAPI, using compatibility shim")
+        target_include_directories(ggml-sycl BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/sycl_compat")
+    endif()
 else()
     # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
     target_compile_options(ggml-sycl PRIVATE "-fsycl")
@@ -93,6 +99,23 @@ endif()
 
 target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
 
+# Level Zero: direct device memory allocation bypasses xe/TTM staging overhead
+message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
+if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
+    find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h
+              HINTS ${ONEAPI_ROOT}/include /usr/include)
+    find_library(ZE_LOADER_LIB ze_loader
+                 HINTS ${ONEAPI_ROOT}/lib /usr/lib /usr/lib64 ENV LD_LIBRARY_PATH)
+    if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
+        target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
+        message(STATUS "Level Zero loader: ${ZE_LOADER_LIB}")
+        message(STATUS "Level Zero headers: ${LEVEL_ZERO_INCLUDE_DIR}")
+    else()
+        message(WARNING "Level Zero not found — L0 memory path disabled (loader=${ZE_LOADER_LIB}, inc=${LEVEL_ZERO_INCLUDE_DIR})")
+    endif()
+endif()
+
 # Link against oneDNN
 set(GGML_SYCL_DNNL 0)
 if(GGML_SYCL_DNN)

@@ -12,6 +12,11 @@
 
 #include "common.hpp"
 
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+#include <sycl/backend.hpp>
+#include <level_zero/ze_api.h>
+#endif
+
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 
@@ -66,6 +71,52 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
   return sycl_down_blk_size;
 }
 
+// ggml_sycl_malloc_device: allocates device memory via Level Zero (if available)
+// to avoid the xe driver's TTM staging path triggered by sycl::malloc_device.
+// sycl::malloc_device creates a 1:1 host mirror of every VRAM allocation via
+// xe_gem_prime_export; zeMemAllocDevice uses SVM/P2P path with ~8 MiB overhead.
+void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+    if (g_ggml_sycl_enable_level_zero) {
+        auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
+        auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
+
+        ze_relaxed_allocation_limits_exp_desc_t relaxed = {
+            ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC,
+            nullptr,
+            ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE
+        };
+        ze_device_mem_alloc_desc_t alloc_desc = {
+            ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+            &relaxed,
+            0, 0
+        };
+        void *ptr = nullptr;
+        ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr);
+        if (r == ZE_RESULT_SUCCESS && ptr) {
+            return ptr;
+        }
+        GGML_LOG_WARN("zeMemAllocDevice failed (0x%x), falling back to sycl::malloc_device\n", r);
+    }
+#endif
+    return sycl::malloc_device(size, q);
+}
+
+void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
+    if (!ptr) return;
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+    if (g_ggml_sycl_enable_level_zero) {
+        sycl::usm::alloc type = sycl::get_pointer_type(ptr, q.get_context());
+        if (type == sycl::usm::alloc::unknown) {
+            auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
+            zeMemFree(ze_ctx, ptr);
+            return;
+        }
+    }
+#endif
+    sycl::free(ptr, q);
+}
+
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
@@ -75,8 +126,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> str
         }
         if (extra->data_device[i] != nullptr && streams.size()>0) {
             ggml_sycl_set_device(i);
-            SYCL_CHECK(
-                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+            SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i]))));
         }
     }
     delete extra;

@@ -309,6 +309,15 @@ struct ggml_tensor_extra_gpu {
   optimize_feature optimized_feature;
 };
 
+// Level Zero direct device memory helpers.
+// ggml_sycl_malloc_device uses zeMemAllocDevice (bypasses TTM staging in xe driver).
+// ggml_sycl_free_device matches the allocator used by ggml_sycl_malloc_device.
+#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
+extern int g_ggml_sycl_enable_level_zero;
+#endif
+void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
+void   ggml_sycl_free_device(void *ptr, sycl::queue &q);
+
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
 
 namespace sycl_ex = sycl::ext::oneapi::experimental;

@@ -684,13 +684,13 @@ namespace dpct
             "use total memory as free memory";
 #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
         if (!has(sycl::aspect::ext_intel_free_memory)) {
-          std::cerr << warning_info << std::endl;
           free_memory = total_memory;
         } else {
           free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
         }
 #else
-        std::cerr << warning_info << std::endl;
+        static bool warned = false;
+        if (!warned) { std::cerr << warning_info << std::endl; warned = true; }
         free_memory = total_memory;
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma message("Querying the number of bytes of free memory is not supported")