diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index 19f3ae83a9..13f22a9789 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -123,6 +123,11 @@ jobs: ref: ${{ inputs.pytorch_ref }} mode: ${{ inputs.pytorch_mode }} + - name: Build Proton with XPU support + if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100' + run: | + echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV + - name: Setup Triton uses: ./.github/actions/setup-triton with: diff --git a/python/setup.py b/python/setup.py index 15019f104b..340f951689 100644 --- a/python/setup.py +++ b/python/setup.py @@ -502,6 +502,7 @@ def build_extension(self, ext): # environment variables we will pass through to cmake passthrough_args = [ "TRITON_BUILD_PROTON", + "TRITON_BUILD_PROTON_XPU", "TRITON_BUILD_WITH_CCACHE", "TRITON_PARALLEL_LINK_JOBS", ] diff --git a/third_party/intel/backend/driver.py b/third_party/intel/backend/driver.py index 85ec2ba9c7..f76cfce671 100644 --- a/third_party/intel/backend/driver.py +++ b/third_party/intel/backend/driver.py @@ -279,6 +279,8 @@ def compile_module_from_src(src, name): return SpirvUtils(cache_path) elif name == '__triton_launcher': return TritonLauncher(cache_path) + elif name == 'proton_utils': + return cache_path import importlib.util spec = importlib.util.spec_from_file_location(name, cache_path) @@ -744,6 +746,12 @@ def get_current_target(self): warp_size = 32 return GPUTarget("xpu", dev_property, warp_size) + def build_proton_help_lib(self): + from triton.backends.intel.driver import compile_module_from_src + + dirname = os.path.dirname(os.path.realpath(__file__)) + return compile_module_from_src(Path(dirname).joinpath("proton_utils.cpp").read_text(), "proton_utils") + def get_active_torch_device(self): import torch return torch.device("xpu", self.get_current_device()) diff --git a/third_party/intel/backend/proton_utils.cpp b/third_party/intel/backend/proton_utils.cpp new file mode 100644 index 0000000000..15f2c15122 --- /dev/null +++ b/third_party/intel/backend/proton_utils.cpp @@ -0,0 +1,36 @@ +#include + +extern "C" void waitOnSyclQueue(void *syclQueue) { + sycl::queue *queue = static_cast(syclQueue); + queue->wait(); +} + +// FIXME: Should it be in DeviceInfo class? +// Inspired by Kineto: `XpuptiActivityProfiler.cpp` +extern "C" void +enumDeviceUUIDs(std::vector> deviceUUIDs_) { + if (!deviceUUIDs_.empty()) { + return; + } + auto platform_list = sycl::platform::get_platforms(); + // Enumerated GPU devices from the specific platform. + for (const auto &platform : platform_list) { + if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) { + continue; + } + auto device_list = platform.get_devices(); + for (const auto &device : device_list) { + if (device.is_gpu()) { + if (device.has(sycl::aspect::ext_intel_device_info_uuid)) { + deviceUUIDs_.push_back( + device.get_info()); + } else { + std::cerr << "Warnings: UUID is not supported for this XPU device. " + "The device index of records will be 0." + << std::endl; + deviceUUIDs_.push_back(std::array{}); + } + } + } + } +} diff --git a/third_party/proton/CMakeLists.txt b/third_party/proton/CMakeLists.txt index e258f5e9ed..e045be5fc3 100644 --- a/third_party/proton/CMakeLists.txt +++ b/third_party/proton/CMakeLists.txt @@ -2,6 +2,8 @@ project(Proton LANGUAGES CXX) set(PROTON_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/csrc") +option(TRITON_BUILD_PROTON_XPU "Build Proton with XPU support" OFF) + # ============ Check for includes ============= if(NOT CUPTI_INCLUDE_DIR) message(FATAL_ERROR "CUPTI include directory not defined") @@ -11,22 +13,6 @@ if(NOT ROCTRACER_INCLUDE_DIR) endif() if (NOT XPUPTI_INCLUDE_DIR) message(FATAL_ERROR "XPUPTI include directory not defined") -else() - option(ENABLE_XPUTPI_PROFILER "Enable Xpupti Profiler" ON) - set(ENABLE_XPUTPI_PROFILER ${ENABLE_XPUTPI_PROFILER} CACHE BOOL "Enable Xpupti Profiler" FORCE) - - find_path(SYCL_INCLUDE_DIR - NAMES "sycl/sycl.hpp" - HINTS "/opt/intel/oneapi/compiler/latest/include" - ) - - if(SYCL_INCLUDE_DIR) - list(APPEND XPUPTI_INCLUDE_DIR "${SYCL_INCLUDE_DIR}") - list(APPEND XPUPTI_INCLUDE_DIR "${SYCL_INCLUDE_DIR}/sycl") - else() - message(WARNING "SYCL FOLDER not found") - set(ENABLE_XPUTPI_PROFILER, OFF) - endif() endif() if(NOT JSON_INCLUDE_DIR) message(FATAL_ERROR "JSON include directory not defined") @@ -92,16 +78,9 @@ endforeach() add_library(proton SHARED ${_proton_obj_sources}) target_link_libraries(proton PRIVATE Python3::Module) -find_library(SYCL_LIB - NAMES sycl - HINTS "/opt/intel/oneapi/compiler/latest/lib" -) find_library(ZE_TRACING_LIB NAMES ze_tracing_layer PATHS /usr/lib/x86_64-linux-gnu) find_library(ZE_LOADER_LIB NAMES ze_loader PATHS /usr/lib/x86_64-linux-gnu) -if(SYCL_LIB) - target_link_libraries(proton PRIVATE ${SYCL_LIB}) -endif() if(ZE_TRACING_LIB) target_link_libraries(proton PRIVATE ${ZE_TRACING_LIB}) endif() diff --git a/third_party/proton/csrc/Proton.cpp b/third_party/proton/csrc/Proton.cpp index 4baa99060b..d646a6c4f8 100644 --- a/third_party/proton/csrc/Proton.cpp +++ b/third_party/proton/csrc/Proton.cpp @@ -2,6 +2,7 @@ #include #include +#include #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -17,17 +18,19 @@ static void initProton(pybind11::module &&m) { "start", [](const std::string &path, const std::string &contextSourceName, const std::string &dataName, const std::string &profilerName, - const std::string &profilerPath, long sycl_queue) { + const std::string &profilerPath, long sycl_queue, + const std::string &utils_cache_path) { void *queue = reinterpret_cast(sycl_queue); auto sessionId = SessionManager::instance().addSession( path, profilerName, profilerPath, contextSourceName, dataName, - queue); + queue, utils_cache_path); SessionManager::instance().activateSession(sessionId); return sessionId; }, pybind11::arg("path"), pybind11::arg("contextSourceName"), pybind11::arg("dataName"), pybind11::arg("profilerName"), - pybind11::arg("profilerPath"), pybind11::arg("sycl_queue") = 0); + pybind11::arg("profilerPath"), pybind11::arg("sycl_queue") = 0, + pybind11::arg("utils_cache_path") = ""); m.def("activate", [](size_t sessionId) { SessionManager::instance().activateSession(sessionId); diff --git a/third_party/proton/csrc/include/Profiler/GPUProfiler.h b/third_party/proton/csrc/include/Profiler/GPUProfiler.h index 405a114bc4..acf84daeee 100644 --- a/third_party/proton/csrc/include/Profiler/GPUProfiler.h +++ b/third_party/proton/csrc/include/Profiler/GPUProfiler.h @@ -52,6 +52,11 @@ class GPUProfiler : public Profiler, return dynamic_cast(*this); } + ConcreteProfilerT &setUtilsCachePath(const std::string &utils_cache_path) { + this->utils_cache_path = utils_cache_path; + return dynamic_cast(*this); + } + protected: // OpInterface void startOp(const Scope &scope) override { @@ -139,6 +144,7 @@ class GPUProfiler : public Profiler, static thread_local ThreadState threadState; Correlation correlation; void *syclQueue; + std::string utils_cache_path; // Use the pimpl idiom to hide the implementation details. This lets us avoid // including the cupti header from this header. The cupti header and the diff --git a/third_party/proton/csrc/include/Session/Session.h b/third_party/proton/csrc/include/Session/Session.h index 69e505e662..9d4d132fbe 100644 --- a/third_party/proton/csrc/include/Session/Session.h +++ b/third_party/proton/csrc/include/Session/Session.h @@ -76,7 +76,8 @@ class SessionManager : public Singleton { size_t addSession(const std::string &path, const std::string &profilerName, const std::string &profilerPath, const std::string &contextSourceName, - const std::string &dataName, void *sycl_queue); + const std::string &dataName, void *sycl_queue, + const std::string &utils_cache_path); void finalizeSession(size_t sessionId, OutputFormat outputFormat); @@ -106,12 +107,11 @@ class SessionManager : public Singleton { void setState(std::optional context); private: - std::unique_ptr makeSession(size_t id, const std::string &path, - const std::string &profilerName, - const std::string &profilerPath, - const std::string &contextSourceName, - const std::string &dataName, - void *sycl_queue); + std::unique_ptr + makeSession(size_t id, const std::string &path, + const std::string &profilerName, const std::string &profilerPath, + const std::string &contextSourceName, const std::string &dataName, + void *sycl_queue, const std::string &utils_cache_path); void activateSessionImpl(size_t sessionId); diff --git a/third_party/proton/csrc/lib/Driver/CMakeLists.txt b/third_party/proton/csrc/lib/Driver/CMakeLists.txt index dae896e20f..166a97b500 100644 --- a/third_party/proton/csrc/lib/Driver/CMakeLists.txt +++ b/third_party/proton/csrc/lib/Driver/CMakeLists.txt @@ -1,10 +1,15 @@ -add_proton_library(ProtonDriver +set(SOURCE_FILES Device.cpp GPU/CudaApi.cpp GPU/CuptiApi.cpp GPU/HipApi.cpp GPU/HsaApi.cpp GPU/RoctracerApi.cpp - GPU/XpuApi.cpp - GPU/XpuptiApi.cpp ) + +if(TRITON_BUILD_PROTON_XPU) + list(APPEND SOURCE_FILES GPU/XpuApi.cpp) + list(APPEND SOURCE_FILES GPU/XpuptiApi.cpp) +endif() + +add_proton_library(ProtonDriver ${SOURCE_FILES}) diff --git a/third_party/proton/csrc/lib/Profiler/CMakeLists.txt b/third_party/proton/csrc/lib/Profiler/CMakeLists.txt index 58de4053b8..a090ab2704 100644 --- a/third_party/proton/csrc/lib/Profiler/CMakeLists.txt +++ b/third_party/proton/csrc/lib/Profiler/CMakeLists.txt @@ -4,7 +4,7 @@ set(SOURCE_FILES RocTracer/RoctracerProfiler.cpp ) -if(ENABLE_XPUTPI_PROFILER) +if(TRITON_BUILD_PROTON_XPU) list(APPEND SOURCE_FILES Xpupti/XpuptiProfiler.cpp) endif() diff --git a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp index 4e1fd1c52a..ee407311b5 100644 --- a/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp @@ -7,9 +7,14 @@ #include "Utility/Map.h" #include "pti/pti_view.h" +#include +#include #include #include -#include + +#include +#include +#include #include #include @@ -30,35 +35,6 @@ namespace { std::vector> deviceUUIDs_ = {}; -// FIXME: Should it be in DeviceInfo class? -// Inspired by Kineto: `XpuptiActivityProfiler.cpp` -void enumDeviceUUIDs() { - if (!deviceUUIDs_.empty()) { - return; - } - auto platform_list = sycl::platform::get_platforms(); - // Enumerated GPU devices from the specific platform. - for (const auto &platform : platform_list) { - if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) { - continue; - } - auto device_list = platform.get_devices(); - for (const auto &device : device_list) { - if (device.is_gpu()) { - if (device.has(sycl::aspect::ext_intel_device_info_uuid)) { - deviceUUIDs_.push_back( - device.get_info()); - } else { - std::cerr << "Warnings: UUID is not supported for this XPU device. " - "The device index of records will be 0." - << std::endl; - deviceUUIDs_.push_back(std::array{}); - } - } - } - } -} - uint8_t getDeviceIdxFromUUID(const uint8_t deviceUUID[16]) { std::array key; memcpy(key.data(), deviceUUID, 16); @@ -353,9 +329,63 @@ void XpuptiProfiler::XpuptiProfilerPimpl::completeBuffer(uint8_t *buffer, zel_tracer_handle_t tracer = nullptr; +typedef void (*EnumDeviceUUIDsFunc)(std::vector>); + +int callEnumDeviceUUIDs(const std::string &utils_cache_path) { + void *handle = dlopen(utils_cache_path.data(), RTLD_LAZY); + if (!handle) { + std::cerr << "Failed to load library: " << dlerror() << std::endl; + return 1; + } + + dlerror(); + EnumDeviceUUIDsFunc enumDeviceUUIDs = + (EnumDeviceUUIDsFunc)dlsym(handle, "enumDeviceUUIDs"); + const char *dlsym_error = dlerror(); + if (dlsym_error) { + std::cerr << "Failed to load function: " << dlsym_error << std::endl; + dlclose(handle); + return 1; + } + + enumDeviceUUIDs(deviceUUIDs_); + + dlclose(handle); + return 0; +} + +typedef void (*WaitOnSyclQueueFunc)(void *); + +int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) { + void *handle = dlopen(utils_cache_path.data(), RTLD_LAZY); + if (!handle) { + std::cerr << "Failed to load library: " << dlerror() << std::endl; + return 1; + } + + dlerror(); + WaitOnSyclQueueFunc waitOnSyclQueue = + (WaitOnSyclQueueFunc)dlsym(handle, "waitOnSyclQueue"); + const char *dlsym_error = dlerror(); + if (dlsym_error) { + std::cerr << "Failed to load function: " << dlsym_error << std::endl; + dlclose(handle); + return 1; + } + + waitOnSyclQueue(syclQueue); + + dlclose(handle); + return 0; +} + void XpuptiProfiler::XpuptiProfilerPimpl::doStart() { // xpupti::subscribe(&subscriber, callbackFn, nullptr); - enumDeviceUUIDs(); + // should be call to shared lib + XpuptiProfiler &profiler = threadState.profiler; + if (profiler.utils_cache_path != "") { + callEnumDeviceUUIDs(profiler.utils_cache_path); + } // auto res = ptiViewPushExternalCorrelationId( // pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_1, 42); // std::cout << "res: " << res << "\n" << std::flush; @@ -405,8 +435,7 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doFlush() { std::cout << "flush\n" << std::flush; XpuptiProfiler &profiler = threadState.profiler; if (profiler.syclQueue != nullptr) { - sycl::queue *syclQueue = static_cast(profiler.syclQueue); - syclQueue->wait(); + callWaitOnSyclQueue(profiler.utils_cache_path, profiler.syclQueue); } profiler.correlation.flush( diff --git a/third_party/proton/csrc/lib/Session/Session.cpp b/third_party/proton/csrc/lib/Session/Session.cpp index 1e9cc7dec7..253dc1fd28 100644 --- a/third_party/proton/csrc/lib/Session/Session.cpp +++ b/third_party/proton/csrc/lib/Session/Session.cpp @@ -11,7 +11,8 @@ namespace proton { namespace { Profiler *getProfiler(const std::string &name, const std::string &path, - void *sycl_queue = nullptr) { + void *sycl_queue = nullptr, + const std::string &utils_cache_path = "") { if (proton::toLower(name) == "cupti") { return &CuptiProfiler::instance().setLibPath(path); } @@ -19,7 +20,9 @@ Profiler *getProfiler(const std::string &name, const std::string &path, return &CuptiProfiler::instance().setLibPath(path).enablePCSampling(); } if (proton::toLower(name) == "xpupti") { - return &XpuptiProfiler::instance().setSyclQueue(sycl_queue); + return &XpuptiProfiler::instance() + .setSyclQueue(sycl_queue) + .setUtilsCachePath(utils_cache_path); } if (proton::toLower(name) == "roctracer") { return &RoctracerProfiler::instance(); @@ -79,8 +82,10 @@ size_t Session::getContextDepth() { return contextSource->getDepth(); } std::unique_ptr SessionManager::makeSession( size_t id, const std::string &path, const std::string &profilerName, const std::string &profilerPath, const std::string &contextSourceName, - const std::string &dataName, void *sycl_queue) { - auto profiler = getProfiler(profilerName, profilerPath, sycl_queue); + const std::string &dataName, void *sycl_queue, + const std::string &utils_cache_path) { + auto profiler = + getProfiler(profilerName, profilerPath, sycl_queue, utils_cache_path); auto contextSource = makeContextSource(contextSourceName); auto data = makeData(dataName, path, contextSource.get()); auto *session = new Session(id, path, profiler, std::move(contextSource), @@ -149,8 +154,8 @@ size_t SessionManager::addSession(const std::string &path, const std::string &profilerName, const std::string &profilerPath, const std::string &contextSourceName, - const std::string &dataName, - void *sycl_queue) { + const std::string &dataName, void *sycl_queue, + const std::string &utils_cache_path) { std::lock_guard lock(mutex); if (hasSession(path)) { auto sessionId = getSessionId(path); @@ -159,8 +164,9 @@ size_t SessionManager::addSession(const std::string &path, } auto sessionId = nextSessionId++; sessionPaths[path] = sessionId; - sessions[sessionId] = makeSession(sessionId, path, profilerName, profilerPath, - contextSourceName, dataName, sycl_queue); + sessions[sessionId] = + makeSession(sessionId, path, profilerName, profilerPath, + contextSourceName, dataName, sycl_queue, utils_cache_path); return sessionId; } diff --git a/third_party/proton/proton/profile.py b/third_party/proton/proton/profile.py index 700051417d..7a201aa611 100644 --- a/third_party/proton/proton/profile.py +++ b/third_party/proton/proton/profile.py @@ -9,6 +9,7 @@ from typing import Optional DEFAULT_PROFILE_NAME = "proton" +UTILS_CACHE_PATH = None def _select_backend() -> str: @@ -18,6 +19,8 @@ def _select_backend() -> str: elif backend == "hip": return "roctracer" elif backend == "xpu": + global UTILS_CACHE_PATH + UTILS_CACHE_PATH = triton.runtime.driver.active.build_proton_help_lib() return "xpupti" else: raise ValueError("No backend is available for the current target.") @@ -101,9 +104,11 @@ def start( register_triton_hook() sycl_queue = 0 + utils_cache_path = "" if hasattr(triton.runtime.driver.active.utils, "get_sycl_queue"): sycl_queue = triton.runtime.driver.active.utils.get_sycl_queue() - return libproton.start(name, context, data, backend, backend_path, sycl_queue) + utils_cache_path = UTILS_CACHE_PATH + return libproton.start(name, context, data, backend, backend_path, sycl_queue, utils_cache_path) def activate(session: Optional[int] = None) -> None: