diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt old mode 100755 new mode 100644 index 32f625da6cb8b..6533112460e9a --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -15,6 +15,7 @@ else() set(LEVEL_ZERO_LOADER_TAG v1.8.8) endif() +# Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 set(CMAKE_INCLUDE_CURRENT_DIR OFF) message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}") @@ -57,7 +58,7 @@ add_sycl_plugin(level_zero UnifiedRuntime-Headers ze_loader Threads::Threads - pi_unified_runtime + unified_runtime_static ) find_package(Python3 REQUIRED) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 3fb5ccd370669..2ed917a017d4c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -90,18 +90,12 @@ static const bool ReuseDiscardedEvents = [] { return std::stoi(ReuseDiscardedEventsFlag) > 0; }(); -// Controls PI level tracing prints. -static bool PrintPiTrace = false; - // Controls support of the indirect access kernels and deferred memory release. static const bool IndirectAccessTrackingEnabled = [] { return std::getenv("SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY") != nullptr; }(); -// This will count the calls to Level-Zero -static std::map *ZeCallCount = nullptr; - // Map from L0 to PI result static inline pi_result mapError(ze_result_t Result) { return ur2piResult(ze2urResult(Result)); @@ -118,7 +112,7 @@ static inline pi_result mapError(ze_result_t Result) { // Trace an internal PI call; returns in case of an error. #define PI_CALL(Call) \ { \ - if (PrintPiTrace) \ + if (PrintTrace) \ fprintf(stderr, "PI ---> %s\n", #Call); \ pi_result Result = (Call); \ if (Result != PI_SUCCESS) \ @@ -352,15 +346,6 @@ static bool CopyEngineRequested(pi_device Device) { } // Global variables used in PI_Level_Zero -// Note we only create a simple pointer variables such that C++ RT won't -// deallocate them automatically at the end of the main program. -// The heap memory allocated for these global variables reclaimed only when -// Sycl RT calls piTearDown(). -static std::vector *PiPlatformsCache = - new std::vector; -static sycl::detail::SpinLock *PiPlatformsCacheMutex = - new sycl::detail::SpinLock; -static bool PiPlatformCachePopulated = false; pi_result _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, @@ -470,55 +455,6 @@ static pi_result enqueueMemCopyRectHelper( const pi_event *EventWaitList, pi_event *Event, bool PreferCopyEngine = false); -inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { - switch (ZeError) { -#define ZE_ERRCASE(ERR) \ - case ERR: \ - ErrorString = "" #ERR; \ - break; - - ZE_ERRCASE(ZE_RESULT_SUCCESS) - ZE_ERRCASE(ZE_RESULT_NOT_READY) - ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST) - ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) - ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) - ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) - ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) - ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE) - ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) - ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) - ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) - ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) - ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) - ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN) - -#undef ZE_ERRCASE - default: - assert(false && "Unexpected Error code"); - } // switch -} - // Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR constexpr size_t MaxMessageSize = 256; thread_local pi_result ErrorMessageCode = PI_SUCCESS; @@ -538,26 +474,6 @@ pi_result piPluginGetLastError(char **message) { return ErrorMessageCode; } -ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, - const char *ZeArgs, bool TraceError) { - zePrint("ZE ---> %s%s\n", ZeName, ZeArgs); - - if (ZeDebug & ZE_DEBUG_CALL_COUNT) { - ++(*ZeCallCount)[ZeName]; - } - - if (ZeResult && TraceError) { - const char *ErrorString = "Unknown"; - zeParseError(ZeResult, ErrorString); - zePrint("Error (%s) in %s\n", ErrorString, ZeName); - } - return ZeResult; -} - -#define PI_ASSERT(condition, error) \ - if (!(condition)) \ - return error; - bool _pi_queue::doReuseDiscardedEvents() { return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); } @@ -1580,7 +1496,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // traces incurs much different timings than real execution // ansyway, and many regression tests use it. // - bool CurrentlyEmpty = !PrintPiTrace && this->LastCommandEvent == nullptr; + bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr; // The list can be empty if command-list only contains signals of proxy // events. It is possible that executeCommandList is called twice for the same @@ -2238,117 +2154,7 @@ checkUnresolvedSymbols(ze_module_handle_t ZeModule, pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - - static const char *PiTrace = std::getenv("SYCL_PI_TRACE"); - static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0; - if (PiTraceValue == -1 || PiTraceValue == 2) { // Means print all PI traces - PrintPiTrace = true; - } - - static std::once_flag ZeCallCountInitialized; - try { - std::call_once(ZeCallCountInitialized, []() { - if (ZeDebug & ZE_DEBUG_CALL_COUNT) { - ZeCallCount = new std::map; - } - }); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (NumEntries == 0 && Platforms != nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (Platforms == nullptr && NumPlatforms == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - - // Setting these environment variables before running zeInit will enable the - // validation layer in the Level Zero loader. - if (ZeDebug & ZE_DEBUG_VALIDATION) { - setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); - setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); - } - - // Enable SYSMAN support for obtaining the PCI address - // and maximum memory bandwidth. - if (getenv("SYCL_ENABLE_PCI") != nullptr) { - setEnvVar("ZES_ENABLE_SYSMAN", "1"); - } - - // TODO: We can still safely recover if something goes wrong during the init. - // Implement handling segfault using sigaction. - - // We must only initialize the driver once, even if piPlatformsGet() is called - // multiple times. Declaring the return value as "static" ensures it's only - // called once. - static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); - - // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - PI_ASSERT(NumPlatforms != 0, PI_ERROR_INVALID_VALUE); - *NumPlatforms = 0; - return PI_SUCCESS; - } - - if (ZeResult != ZE_RESULT_SUCCESS) { - zePrint("zeInit: Level Zero initialization failure\n"); - return mapError(ZeResult); - } - - // Cache pi_platforms for reuse in the future - // It solves two problems; - // 1. sycl::platform equality issue; we always return the same pi_platform. - // 2. performance; we can save time by immediately return from cache. - // - - const std::lock_guard Lock{*PiPlatformsCacheMutex}; - if (!PiPlatformCachePopulated) { - try { - // Level Zero does not have concept of Platforms, but Level Zero driver is - // the closest match. - uint32_t ZeDriverCount = 0; - ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); - if (ZeDriverCount == 0) { - PiPlatformCachePopulated = true; - } else { - std::vector ZeDrivers; - ZeDrivers.resize(ZeDriverCount); - - ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); - for (uint32_t I = 0; I < ZeDriverCount; ++I) { - pi_platform Platform = new _pi_platform(ZeDrivers[I]); - // Save a copy in the cache for future uses. - PiPlatformsCache->push_back(Platform); - - pi_result Result = Platform->initialize(); - if (Result != PI_SUCCESS) { - return Result; - } - } - PiPlatformCachePopulated = true; - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - } - - // Populate returned platforms from the cache. - if (Platforms) { - PI_ASSERT(NumEntries <= PiPlatformsCache->size(), - PI_ERROR_INVALID_PLATFORM); - std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); - } - - if (NumPlatforms) { - *NumPlatforms = PiPlatformsCache->size(); - } - - return PI_SUCCESS; + return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms); } pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, @@ -3368,10 +3174,10 @@ pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, // TODO: maybe we should populate cache of platforms if it wasn't already. // For now assert that is was populated. PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE); - const std::lock_guard Lock{*PiPlatformsCacheMutex}; + const std::lock_guard Lock{*PiPlatformsCacheMutex}; pi_device Dev = nullptr; - for (auto &ThePlatform : *PiPlatformsCache) { + for (pi_platform ThePlatform : *PiPlatformsCache) { Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); if (Dev) { // Check that the input Platform, if was given, matches the found one. @@ -8781,7 +8587,7 @@ pi_result piTearDown(void *PluginParameter) { (void)PluginParameter; bool LeakFound = false; // reclaim pi_platform objects here since we don't have piPlatformRelease. - for (pi_platform &Platform : *PiPlatformsCache) { + for (pi_platform Platform : *PiPlatformsCache) { delete Platform; } delete PiPlatformsCache; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 034862c9ecbc1..6d82fed43fdaa 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -40,9 +40,9 @@ #include #include +#include #include #include -#include // Share code between this PI L0 Plugin and UR L0 Adapter #include @@ -207,6 +207,8 @@ struct _pi_platform : public _ur_level_zero_platform { pi_shared_mutex ContextsMutex; }; +struct _zer_platform_handle_t : public _pi_platform {}; + // Implements memory allocation via L0 RT for USM allocator interface. class USMMemoryAllocBase : public SystemMemory { protected: diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt old mode 100755 new mode 100644 index d6cb646e6155d..5dd82f3491355 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -31,6 +31,35 @@ list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS unified-runtime) find_package(Threads REQUIRED) +# +# Build a helper static library to carry pieces shared between +# this Unified Runtime plugin and Level Zero plugin. We cannot +# use dynamic plugin library as that only exports pi* symbols. +# +add_library(unified_runtime_static STATIC + "${sycl_inc_dir}/sycl/detail/pi.h" + "${CMAKE_CURRENT_SOURCE_DIR}/ur.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/ur.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/pi2ur.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/adapters/level_zero/ur_level_zero.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/adapters/level_zero/ur_level_zero.cpp" +) + +target_include_directories(unified_runtime_static + INTERFACE + "${UNIFIED_RUNTIME_INCLUDE_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" + PRIVATE + # For include + # TODO: how to get rid of this? + "${CMAKE_SOURCE_DIR}/../sycl/include" +) + +target_link_libraries(unified_runtime_static PRIVATE + UnifiedRuntime-Headers + LevelZeroLoader-Headers +) + # # NOTE: the Unified Runtime doesn't have the loader [yet]. # So what we really build is the Unified Runtime with Level Zero Adapter @@ -38,15 +67,13 @@ find_package(Threads REQUIRED) # add_sycl_plugin(unified_runtime SOURCES - "${sycl_inc_dir}/sycl/detail/pi.h" - "pi2ur.cpp" - "pi2ur.hpp" - "ur.hpp" - "adapters/level_zero/ur_level_zero.hpp" - "adapters/level_zero/ur_level_zero.cpp" + # Put here anything that belongs exclusively to Unified Runtime + # and should not be shared with the Level Zero plugin + "${CMAKE_CURRENT_SOURCE_DIR}/pi2ur.cpp" INCLUDE_DIRS "${UNIFIED_RUNTIME_INCLUDE_DIR}" LIBRARIES + unified_runtime_static Threads::Threads UnifiedRuntime-Headers LevelZeroLoader-Headers diff --git a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp index 71d184d29a8a8..c18f540f7a783 100644 --- a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.cpp @@ -6,6 +6,7 @@ // //===-----------------------------------------------------------------===// +#include #include #include "ur_level_zero.hpp" @@ -49,6 +50,74 @@ bool setEnvVar(const char *name, const char *value) { return ze2urResult(Result); \ } +// This will count the calls to Level-Zero +std::map *ZeCallCount = nullptr; + +inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { + switch (ZeError) { +#define ZE_ERRCASE(ERR) \ + case ERR: \ + ErrorString = "" #ERR; \ + break; + + ZE_ERRCASE(ZE_RESULT_SUCCESS) + ZE_ERRCASE(ZE_RESULT_NOT_READY) + ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST) + ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) + ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) + ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) + ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) + ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) + ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) + ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) + ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN) + +#undef ZE_ERRCASE + default: + assert(false && "Unexpected Error code"); + } // switch +} + +ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, + const char *ZeArgs, bool TraceError) { + zePrint("ZE ---> %s%s\n", ZeName, ZeArgs); + + if (ZeDebug & ZE_DEBUG_CALL_COUNT) { + ++(*ZeCallCount)[ZeName]; + } + + if (ZeResult && TraceError) { + const char *ErrorString = "Unknown"; + zeParseError(ZeResult, ErrorString); + zePrint("Error (%s) in %s\n", ErrorString, ZeName); + } + return ZeResult; +} + // Specializations for various L0 structures template <> ze_structure_type_t getZeStructureType() { return ZE_STRUCTURE_TYPE_EVENT_POOL_DESC; @@ -204,3 +273,119 @@ zer_result_t _ur_level_zero_platform::initialize() { return ZER_RESULT_SUCCESS; } + +ZER_APIEXPORT zer_result_t ZER_APICALL zerPlatformGet( + uint32_t + *NumPlatforms, ///< [in,out] pointer to the number of platforms. + ///< if count is zero, then the call shall update the + ///< value with the total number of platforms available. + ///< if count is greater than the number of platforms + ///< available, then the call shall update the value with + ///< the correct number of platforms available. + zer_platform_handle_t + *Platforms ///< [out][optional][range(0, *pCount)] array of handle of + ///< platforms. if count is less than the number of platforms + ///< available, then platform shall only retrieve that number + ///< of platforms. +) { + PI_ASSERT(NumPlatforms, ZER_RESULT_INVALID_VALUE); + + static std::once_flag ZeCallCountInitialized; + try { + std::call_once(ZeCallCountInitialized, []() { + if (ZeDebug & ZE_DEBUG_CALL_COUNT) { + ZeCallCount = new std::map; + } + }); + } catch (const std::bad_alloc &) { + return ZER_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return ZER_RESULT_ERROR_UNKNOWN; + } + + // Setting these environment variables before running zeInit will enable the + // validation layer in the Level Zero loader. + if (ZeDebug & ZE_DEBUG_VALIDATION) { + setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); + setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); + } + + // Enable SYSMAN support for obtaining the PCI address + // and maximum memory bandwidth. + if (getenv("SYCL_ENABLE_PCI") != nullptr) { + setEnvVar("ZES_ENABLE_SYSMAN", "1"); + } + + // TODO: We can still safely recover if something goes wrong during the init. + // Implement handling segfault using sigaction. + + // We must only initialize the driver once, even if piPlatformsGet() is called + // multiple times. Declaring the return value as "static" ensures it's only + // called once. + static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); + + // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + PI_ASSERT(NumPlatforms != 0, ZER_RESULT_INVALID_VALUE); + *NumPlatforms = 0; + return ZER_RESULT_SUCCESS; + } + + if (ZeResult != ZE_RESULT_SUCCESS) { + zePrint("zeInit: Level Zero initialization failure\n"); + return ze2urResult(ZeResult); + } + + // Cache pi_platforms for reuse in the future + // It solves two problems; + // 1. sycl::platform equality issue; we always return the same pi_platform. + // 2. performance; we can save time by immediately return from cache. + // + + const std::lock_guard Lock{*PiPlatformsCacheMutex}; + if (!PiPlatformCachePopulated) { + try { + // Level Zero does not have concept of Platforms, but Level Zero driver is + // the closest match. + uint32_t ZeDriverCount = 0; + ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); + if (ZeDriverCount == 0) { + PiPlatformCachePopulated = true; + } else { + std::vector ZeDrivers; + ZeDrivers.resize(ZeDriverCount); + + ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); + for (uint32_t I = 0; I < ZeDriverCount; ++I) { + auto Platform = new _ur_level_zero_platform(ZeDrivers[I]); + // Save a copy in the cache for future uses. + PiPlatformsCache->push_back((zer_platform_handle_t)Platform); + + zer_result_t Result = Platform->initialize(); + if (Result != ZER_RESULT_SUCCESS) { + return Result; + } + } + PiPlatformCachePopulated = true; + } + } catch (const std::bad_alloc &) { + return ZER_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return ZER_RESULT_ERROR_UNKNOWN; + } + } + + // Populate returned platforms from the cache. + if (Platforms) { + PI_ASSERT(*NumPlatforms <= PiPlatformsCache->size(), + ZER_RESULT_INVALID_PLATFORM); + std::copy_n(PiPlatformsCache->begin(), *NumPlatforms, Platforms); + } + + if (*NumPlatforms == 0) + *NumPlatforms = PiPlatformsCache->size(); + else + *NumPlatforms = std::min(PiPlatformsCache->size(), (size_t)*NumPlatforms); + + return ZER_RESULT_SUCCESS; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp b/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp old mode 100755 new mode 100644 index 86389e5787b1b..36cce6cdf3ba4 --- a/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp +++ b/sycl/plugins/unified_runtime/adapters/level_zero/ur_level_zero.hpp @@ -7,6 +7,7 @@ //===-----------------------------------------------------------------===// #pragma once +#include #include #include #include @@ -14,10 +15,10 @@ #include #include -#include -#include #include +#include #include +#include // Returns the ze_structure_type_t to use in .stype of a structured descriptor. // Intentionally not defined; will give an error if no proper specialization @@ -216,3 +217,6 @@ class ZeUSMImportExtension { }; extern ZeUSMImportExtension ZeUSMImport; + +// This will count the calls to Level-Zero +extern std::map *ZeCallCount; \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/pi2ur.cpp b/sycl/plugins/unified_runtime/pi2ur.cpp index a10ca7534ef14..ed13084ed12fe 100644 --- a/sycl/plugins/unified_runtime/pi2ur.cpp +++ b/sycl/plugins/unified_runtime/pi2ur.cpp @@ -10,32 +10,11 @@ // TODO: remove when SYCL RT is changed to talk in UR directly #include -#include - -// Early exits on any error -#define HANDLE_ERRORS(urCall) \ - if (auto Result = urCall) \ - return ur2piResult(Result); +extern "C" { __SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, pi_uint32 *num_platforms) { - - // https://spec.oneapi.io/unified-runtime/latest/core/api.html#zerplatformget - - uint32_t Count = num_entries; - auto phPlatforms = reinterpret_cast(platforms); - HANDLE_ERRORS(zerPlatformGet(&Count, phPlatforms)); - if (*num_platforms) { - *num_platforms = Count; - } - return PI_SUCCESS; -} - -__SYCL_EXPORT pi_result piPlatformGetInfo(pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - die("Unified Runtime: piPlatformGetInfo is not implemented"); + return pi2ur::piPlatformsGet(num_entries, platforms, num_platforms); } +} // extern "C \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp old mode 100755 new mode 100644 index d461bac4a2cfb..4f00e4d5d1c04 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -11,6 +11,7 @@ #include "zer_api.h" #include +#include // Map of UR error codes to PI error codes static pi_result ur2piResult(zer_result_t urResult) { @@ -22,6 +23,7 @@ static pi_result ur2piResult(zer_result_t urResult) { // static std::unordered_map ErrorMapping = { {ZER_RESULT_SUCCESS, PI_SUCCESS}, + {ZER_RESULT_ERROR_UNKNOWN, PI_ERROR_UNKNOWN}, {ZER_RESULT_ERROR_DEVICE_LOST, PI_ERROR_DEVICE_NOT_FOUND}, {ZER_RESULT_INVALID_OPERATION, PI_ERROR_INVALID_OPERATION}, {ZER_RESULT_INVALID_PLATFORM, PI_ERROR_INVALID_PLATFORM}, @@ -42,3 +44,33 @@ static pi_result ur2piResult(zer_result_t urResult) { } return It->second; } + +// Early exits on any error +#define HANDLE_ERRORS(urCall) \ + if (auto Result = urCall) \ + return ur2piResult(Result); + +namespace pi2ur { +__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 num_entries, + pi_platform *platforms, + pi_uint32 *num_platforms) { + + // https://spec.oneapi.io/unified-runtime/latest/core/api.html#zerplatformget + + uint32_t Count = num_entries; + auto phPlatforms = reinterpret_cast(platforms); + HANDLE_ERRORS(zerPlatformGet(&Count, phPlatforms)); + if (*num_platforms) { + *num_platforms = Count; + } + return PI_SUCCESS; +} + +__SYCL_EXPORT pi_result piPlatformGetInfo(pi_platform platform, + pi_platform_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + die("Unified Runtime: piPlatformGetInfo is not implemented"); +} +} // namespace pi2ur \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur.cpp b/sycl/plugins/unified_runtime/ur.cpp new file mode 100644 index 0000000000000..874a78a3ff415 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur.cpp @@ -0,0 +1,26 @@ + +//===--------- ur.hpp - Unified Runtime -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +// Controls tracing UR calls from within the UR itself. +bool PrintTrace = [] { + const char *Trace = std::getenv("SYCL_PI_TRACE"); + const int TraceValue = Trace ? std::stoi(Trace) : 0; + if (TraceValue == -1 || TraceValue == 2) { // Means print all traces + return true; + } + return false; +}(); + +// Apparatus for maintaining immutable cache of platforms. +std::vector *PiPlatformsCache = + new std::vector; +SpinLock *PiPlatformsCacheMutex = new SpinLock; +bool PiPlatformCachePopulated = false; diff --git a/sycl/plugins/unified_runtime/ur.hpp b/sycl/plugins/unified_runtime/ur.hpp old mode 100755 new mode 100644 index 63b4a54bb1ffd..89bd0551d65de --- a/sycl/plugins/unified_runtime/ur.hpp +++ b/sycl/plugins/unified_runtime/ur.hpp @@ -7,9 +7,12 @@ //===-----------------------------------------------------------------===// #pragma once +#include #include #include #include +#include +#include #include @@ -32,15 +35,14 @@ static const bool SingleThreadMode = [] { // If SingleThreadMode variable is set then mutex operations are turned into // nop. class pi_shared_mutex { - std::shared_mutex Mutex; + std::shared_mutex Mutex; + public: void lock() { if (!SingleThreadMode) Mutex.lock(); } - bool try_lock() { - return SingleThreadMode ? true : Mutex.try_lock(); - } + bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } void unlock() { if (!SingleThreadMode) Mutex.unlock(); @@ -64,6 +66,7 @@ class pi_shared_mutex { // nop. class pi_mutex { std::mutex Mutex; + public: void lock() { if (!SingleThreadMode) @@ -76,5 +79,43 @@ class pi_mutex { } }; +/// SpinLock is a synchronization primitive, that uses atomic variable and +/// causes thread trying acquire lock wait in loop while repeatedly check if +/// the lock is available. +/// +/// One important feature of this implementation is that std::atomic can +/// be zero-initialized. This allows SpinLock to have trivial constructor and +/// destructor, which makes it possible to use it in global context (unlike +/// std::mutex, that doesn't provide such guarantees). +class SpinLock { +public: + void lock() { + while (MLock.test_and_set(std::memory_order_acquire)) + std::this_thread::yield(); + } + void unlock() { MLock.clear(std::memory_order_release); } + +private: + std::atomic_flag MLock = ATOMIC_FLAG_INIT; +}; + +// Helper for one-liner validation +#define PI_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + // TODO: populate with target agnostic handling of UR platforms struct _ur_platform {}; + +// Controls tracing UR calls from within the UR itself. +extern bool PrintTrace; + +// Apparatus for maintaining immutable cache of platforms. +// +// Note we only create a simple pointer variables such that C++ RT won't +// deallocate them automatically at the end of the main program. +// The heap memory allocated for these global variables reclaimed only at +// explicit tear-down. +extern std::vector *PiPlatformsCache; +extern SpinLock *PiPlatformsCacheMutex; +extern bool PiPlatformCachePopulated;