diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index b7a32e8cdb..125952e038 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -140,6 +140,11 @@ if(UR_BUILD_ADAPTER_L0) PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/adapter_lib_init_linux.cpp ) + else() + target_sources(ur_adapter_level_zero + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/adapter_lib_init_windows.cpp + ) endif() # TODO: fix level_zero adapter conversion warnings diff --git a/source/adapters/level_zero/adapter_lib_init_windows.cpp b/source/adapters/level_zero/adapter_lib_init_windows.cpp new file mode 100644 index 0000000000..c197f2396e --- /dev/null +++ b/source/adapters/level_zero/adapter_lib_init_windows.cpp @@ -0,0 +1,53 @@ +//===--------- adapter_lib_init_linux.cpp - Level Zero Adapter ------------===// +// +// Copyright (C) 2023 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "adapter.hpp" +#include "ur_level_zero.hpp" + +#include + +// On windows, UR is destructed before sycl-rt have destructed the objects, so +// we need to clear the leftover memory before UR destruction. +ur_result_t deleteCacheOnDestruction() { + std::lock_guard Lock{GlobalAdapter->Mutex}; + const auto *platforms = GlobalAdapter->PlatformCache->get_value(); + for (const auto &p : *platforms) { + std::scoped_lock ContextsLock(p->ContextsMutex); + while (!p->Contexts.empty()) { + ur_context_handle_t &ctx = p->Contexts.front(); + ctx->deleteCachedObjectsOnDestruction(); + uint32_t RefCount = ctx->RefCount.load(); + while (RefCount--) { + UR_CALL(urContextRelease(ctx)); + } + // context object should be deleted at this point, but this is a guard + // call to protect from infinite loop on case of error. + deleteFromCachedList(ctx, p->Contexts); + } + } +} + +BOOL WINAPI DllMain(HINSTANCE hinstDLL, // handle to DLL module + DWORD fdwReason, // reason for calling function + LPVOID lpReserved) // reserved +{ + switch (fdwReason) { + case DLL_PROCESS_ATTACH: + break; + case DLL_PROCESS_DETACH: { + return deleteCacheOnDestruction() == UR_RESULT_SUCCESS; + } + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: + break; + } + return TRUE; +} diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index b7d0a4a913..3ac7c6680c 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -450,6 +450,9 @@ extern const bool UseUSMAllocator; // Controls support of the indirect access kernels and deferred memory release. const bool IndirectAccessTrackingEnabled = [] { +#ifdef _WIN32 + return false; +#endif char *UrRet = std::getenv("UR_L0_TRACK_INDIRECT_ACCESS_MEMORY"); char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY"); const bool RetVal = UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0); @@ -530,4 +533,19 @@ extern thread_local int32_t ErrorAdapterNativeCode; ur_result_t ErrorCode, int32_t AdapterErrorCode); +template +void addToCachedList(T &CachedObject, std::list &CachedList) { + auto It = std::find(CachedList.begin(), CachedList.end(), CachedObject); + if (It == CachedList.end()) { + CachedList.push_back(CachedObject); + } +} + +template +void deleteFromCachedList(T &CachedObject, std::list &CachedList) { + auto It = std::find(CachedList.begin(), CachedList.end(), CachedObject); + if (It != CachedList.end()) + CachedList.erase(It); +} + #define L0_DRIVER_INORDER_MIN_VERSION 29534 diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 452189d038..11bad15507 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -40,10 +40,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( Context->initialize(); *RetContext = reinterpret_cast(Context); +#ifdef _WIN32 + std::scoped_lock Lock(Platform->ContextsMutex); + addToCachedList(*RetContext, Platform->Contexts); +#else if (IndirectAccessTrackingEnabled) { std::scoped_lock Lock(Platform->ContextsMutex); Platform->Contexts.push_back(*RetContext); } +#endif } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -158,6 +163,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( ZeContext, NumDevices, Devices, OwnNativeHandle); UrContext->initialize(); *Context = reinterpret_cast(UrContext); +#ifdef _WIN32 + ur_platform_handle_t Platform = Devices[0]->Platform; + std::scoped_lock Lock(Platform->ContextsMutex); + addToCachedList(*Context, Platform->Contexts); +#endif } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -355,13 +365,20 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) { if (!Context->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; - if (IndirectAccessTrackingEnabled) { + auto DeleteFromContextsCache = [&]() { ur_platform_handle_t Plt = Context->getPlatform(); auto &Contexts = Plt->Contexts; auto It = std::find(Contexts.begin(), Contexts.end(), Context); if (It != Contexts.end()) Contexts.erase(It); + }; + + DeleteFromContextsCache(); + + if (IndirectAccessTrackingEnabled) { + DeleteFromContextsCache(); } + ze_context_handle_t DestroyZeContext = Context->OwnNativeHandle ? Context->ZeContext : nullptr; @@ -451,6 +468,7 @@ ur_result_t ur_context_handle_t_::finalize() { } } } + return UR_RESULT_SUCCESS; } @@ -838,3 +856,16 @@ ur_context_handle_t_::getDevices() const { ze_context_handle_t ur_context_handle_t_::getZeHandle() const { return ZeContext; } + +void ur_context_handle_t_::deleteCachedObjectsOnDestruction() { + while (!KernelsCache.empty()) { + ur_kernel_handle_t &kernel = KernelsCache.front(); + uint32_t RefCount = kernel->RefCount.load(); + while (RefCount--) { + UR_CALL_THROWS(urKernelRelease(kernel)); + } + // kernel object should be deleted at this point, but this is a guard call + // to protect from infinite loop on case of error. + deleteFromCachedList(kernel, KernelsCache); + } +} diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index a1212f0698..d17f766fe3 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -23,6 +23,7 @@ #include #include "common.hpp" +#include "kernel.hpp" #include "queue.hpp" #include @@ -175,6 +176,8 @@ struct ur_context_handle_t_ : _ur_object { std::vector> EventCachesDeviceMap{4}; + std::list KernelsCache; + // Initialize the PI context. ur_result_t initialize(); @@ -309,6 +312,8 @@ struct ur_context_handle_t_ : _ur_object { // Get handle to the L0 context ze_context_handle_t getZeHandle() const; + void deleteCachedObjectsOnDestruction(); + private: // Get the cache of events for a provided scope and profiling mode. auto getEventCache(bool HostVisible, bool WithProfiling, diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 3469620b71..0cb916ce9a 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -591,6 +591,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( try { ur_kernel_handle_t_ *UrKernel = new ur_kernel_handle_t_(true, Program); *RetKernel = reinterpret_cast(UrKernel); + +#ifdef _WIN32 + auto &Context = Program->Context; + addToCachedList(*RetKernel, Context->KernelsCache); +#endif + } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -1085,6 +1091,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( } *RetKernel = reinterpret_cast(Kernel); +#ifdef _WIN32 + auto &Context = Program->Context; + addToCachedList(*RetKernel, Context->KernelsCache); +#endif } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) {