Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ option(GGML_RPC "ggml: use RPC"
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
Expand Down
23 changes: 23 additions & 0 deletions ggml/src/ggml-sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ detect_and_find_package(IntelSYCL)
if (IntelSYCL_FOUND)
# Use oneAPI CMake when possible
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
# Add compatibility shim for work_group_static.hpp if not present in this oneAPI version
get_target_property(_sycl_inc IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES)
if(NOT EXISTS "${_sycl_inc}/sycl/ext/oneapi/work_group_static.hpp")
message(STATUS "work_group_static.hpp not found in oneAPI, using compatibility shim")
target_include_directories(ggml-sycl BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/sycl_compat")
endif()
else()
# Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
target_compile_options(ggml-sycl PRIVATE "-fsycl")
Expand All @@ -93,6 +99,23 @@ endif()

target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")

# Level Zero: direct device memory allocation bypasses xe/TTM staging overhead
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h
HINTS ${ONEAPI_ROOT}/include /usr/include)
find_library(ZE_LOADER_LIB ze_loader
HINTS ${ONEAPI_ROOT}/lib /usr/lib /usr/lib64 ENV LD_LIBRARY_PATH)
if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
message(STATUS "Level Zero loader: ${ZE_LOADER_LIB}")
message(STATUS "Level Zero headers: ${LEVEL_ZERO_INCLUDE_DIR}")
else()
message(WARNING "Level Zero not found — L0 memory path disabled (loader=${ZE_LOADER_LIB}, inc=${LEVEL_ZERO_INCLUDE_DIR})")
endif()
endif()

# Link against oneDNN
set(GGML_SYCL_DNNL 0)
if(GGML_SYCL_DNN)
Expand Down
54 changes: 52 additions & 2 deletions ggml/src/ggml-sycl/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@

#include "common.hpp"

#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
#include <sycl/backend.hpp>
#include <level_zero/ze_api.h>
#endif

#include "ggml-backend-impl.h"
#include "ggml-impl.h"

Expand Down Expand Up @@ -66,6 +71,52 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
return sycl_down_blk_size;
}

// ggml_sycl_malloc_device: allocates device memory via Level Zero (if available)
// to avoid the xe driver's TTM staging path triggered by sycl::malloc_device.
// sycl::malloc_device creates a 1:1 host mirror of every VRAM allocation via
// xe_gem_prime_export; zeMemAllocDevice uses SVM/P2P path with ~8 MiB overhead.
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
if (g_ggml_sycl_enable_level_zero) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());

ze_relaxed_allocation_limits_exp_desc_t relaxed = {
ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC,
nullptr,
ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE
};
ze_device_mem_alloc_desc_t alloc_desc = {
ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
&relaxed,
0, 0
};
void *ptr = nullptr;
ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr);
if (r == ZE_RESULT_SUCCESS && ptr) {
return ptr;
}
GGML_LOG_WARN("zeMemAllocDevice failed (0x%x), falling back to sycl::malloc_device\n", r);
}
#endif
return sycl::malloc_device(size, q);
}

void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
if (!ptr) return;
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
if (g_ggml_sycl_enable_level_zero) {
sycl::usm::alloc type = sycl::get_pointer_type(ptr, q.get_context());
if (type == sycl::usm::alloc::unknown) {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
zeMemFree(ze_ctx, ptr);
return;
}
}
#endif
sycl::free(ptr, q);
}

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
Expand All @@ -75,8 +126,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> str
}
if (extra->data_device[i] != nullptr && streams.size()>0) {
ggml_sycl_set_device(i);
SYCL_CHECK(
CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i]))));
}
}
delete extra;
Expand Down
9 changes: 9 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,15 @@ struct ggml_tensor_extra_gpu {
optimize_feature optimized_feature;
};

// Level Zero direct device memory helpers.
// ggml_sycl_malloc_device uses zeMemAllocDevice (bypasses TTM staging in xe driver).
// ggml_sycl_free_device matches the allocator used by ggml_sycl_malloc_device.
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
extern int g_ggml_sycl_enable_level_zero;
#endif
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
void ggml_sycl_free_device(void *ptr, sycl::queue &q);

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});

namespace sycl_ex = sycl::ext::oneapi::experimental;
Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-sycl/dpct/helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -684,13 +684,13 @@ namespace dpct
"use total memory as free memory";
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
if (!has(sycl::aspect::ext_intel_free_memory)) {
std::cerr << warning_info << std::endl;
free_memory = total_memory;
} else {
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
}
#else
std::cerr << warning_info << std::endl;
static bool warned = false;
if (!warned) { std::cerr << warning_info << std::endl; warned = true; }
free_memory = total_memory;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma message("Querying the number of bytes of free memory is not supported")
Expand Down
Loading