Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/build-test-reusable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ jobs:
ref: ${{ inputs.pytorch_ref }}
mode: ${{ inputs.pytorch_mode }}

- name: Build Proton with XPU support
if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
run: |
echo TRITON_BUILD_PROTON_XPU=1 | tee -a $GITHUB_ENV

- name: Setup Triton
uses: ./.github/actions/setup-triton
with:
Expand Down
1 change: 1 addition & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ def build_extension(self, ext):
# environment variables we will pass through to cmake
passthrough_args = [
"TRITON_BUILD_PROTON",
"TRITON_BUILD_PROTON_XPU",
"TRITON_BUILD_WITH_CCACHE",
"TRITON_PARALLEL_LINK_JOBS",
]
Expand Down
8 changes: 8 additions & 0 deletions third_party/intel/backend/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ def compile_module_from_src(src, name):
return SpirvUtils(cache_path)
elif name == '__triton_launcher':
return TritonLauncher(cache_path)
elif name == 'proton_utils':
return cache_path

import importlib.util
spec = importlib.util.spec_from_file_location(name, cache_path)
Expand Down Expand Up @@ -744,6 +746,12 @@ def get_current_target(self):
warp_size = 32
return GPUTarget("xpu", dev_property, warp_size)

def build_proton_help_lib(self):
from triton.backends.intel.driver import compile_module_from_src

dirname = os.path.dirname(os.path.realpath(__file__))
return compile_module_from_src(Path(dirname).joinpath("proton_utils.cpp").read_text(), "proton_utils")

def get_active_torch_device(self):
import torch
return torch.device("xpu", self.get_current_device())
Expand Down
36 changes: 36 additions & 0 deletions third_party/intel/backend/proton_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include <sycl/sycl.hpp>

extern "C" void waitOnSyclQueue(void *syclQueue) {
sycl::queue *queue = static_cast<sycl::queue *>(syclQueue);
queue->wait();
}

// FIXME: Should it be in DeviceInfo class?
// Inspired by Kineto: `XpuptiActivityProfiler.cpp`
extern "C" void
enumDeviceUUIDs(std::vector<std::array<uint8_t, 16>> deviceUUIDs_) {
if (!deviceUUIDs_.empty()) {
return;
}
auto platform_list = sycl::platform::get_platforms();
// Enumerated GPU devices from the specific platform.
for (const auto &platform : platform_list) {
if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
continue;
}
auto device_list = platform.get_devices();
for (const auto &device : device_list) {
if (device.is_gpu()) {
if (device.has(sycl::aspect::ext_intel_device_info_uuid)) {
deviceUUIDs_.push_back(
device.get_info<sycl::ext::intel::info::device::uuid>());
} else {
std::cerr << "Warnings: UUID is not supported for this XPU device. "
"The device index of records will be 0."
<< std::endl;
deviceUUIDs_.push_back(std::array<uint8_t, 16>{});
}
}
}
}
}
25 changes: 2 additions & 23 deletions third_party/proton/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ project(Proton LANGUAGES CXX)

set(PROTON_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/csrc")

option(TRITON_BUILD_PROTON_XPU "Build Proton with XPU support" OFF)

# ============ Check for includes =============
if(NOT CUPTI_INCLUDE_DIR)
message(FATAL_ERROR "CUPTI include directory not defined")
Expand All @@ -11,22 +13,6 @@ if(NOT ROCTRACER_INCLUDE_DIR)
endif()
if (NOT XPUPTI_INCLUDE_DIR)
message(FATAL_ERROR "XPUPTI include directory not defined")
else()
option(ENABLE_XPUTPI_PROFILER "Enable Xpupti Profiler" ON)
set(ENABLE_XPUTPI_PROFILER ${ENABLE_XPUTPI_PROFILER} CACHE BOOL "Enable Xpupti Profiler" FORCE)

find_path(SYCL_INCLUDE_DIR
NAMES "sycl/sycl.hpp"
HINTS "/opt/intel/oneapi/compiler/latest/include"
)

if(SYCL_INCLUDE_DIR)
list(APPEND XPUPTI_INCLUDE_DIR "${SYCL_INCLUDE_DIR}")
list(APPEND XPUPTI_INCLUDE_DIR "${SYCL_INCLUDE_DIR}/sycl")
else()
message(WARNING "SYCL FOLDER not found")
set(ENABLE_XPUTPI_PROFILER, OFF)
endif()
endif()
if(NOT JSON_INCLUDE_DIR)
message(FATAL_ERROR "JSON include directory not defined")
Expand Down Expand Up @@ -92,16 +78,9 @@ endforeach()
add_library(proton SHARED ${_proton_obj_sources})

target_link_libraries(proton PRIVATE Python3::Module)
find_library(SYCL_LIB
NAMES sycl
HINTS "/opt/intel/oneapi/compiler/latest/lib"
)
find_library(ZE_TRACING_LIB NAMES ze_tracing_layer PATHS /usr/lib/x86_64-linux-gnu)
find_library(ZE_LOADER_LIB NAMES ze_loader PATHS /usr/lib/x86_64-linux-gnu)
Comment on lines 81 to 82
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: need to move to runtime build or or disable proton build for XPU if there are no corresponding headers


if(SYCL_LIB)
target_link_libraries(proton PRIVATE ${SYCL_LIB})
endif()
if(ZE_TRACING_LIB)
target_link_libraries(proton PRIVATE ${ZE_TRACING_LIB})
endif()
Expand Down
9 changes: 6 additions & 3 deletions third_party/proton/csrc/Proton.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <map>
#include <stdexcept>
#include <string>

#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
Expand All @@ -17,17 +18,19 @@ static void initProton(pybind11::module &&m) {
"start",
[](const std::string &path, const std::string &contextSourceName,
const std::string &dataName, const std::string &profilerName,
const std::string &profilerPath, long sycl_queue) {
const std::string &profilerPath, long sycl_queue,
const std::string &utils_cache_path) {
void *queue = reinterpret_cast<void *>(sycl_queue);
auto sessionId = SessionManager::instance().addSession(
path, profilerName, profilerPath, contextSourceName, dataName,
queue);
queue, utils_cache_path);
SessionManager::instance().activateSession(sessionId);
return sessionId;
},
pybind11::arg("path"), pybind11::arg("contextSourceName"),
pybind11::arg("dataName"), pybind11::arg("profilerName"),
pybind11::arg("profilerPath"), pybind11::arg("sycl_queue") = 0);
pybind11::arg("profilerPath"), pybind11::arg("sycl_queue") = 0,
pybind11::arg("utils_cache_path") = "");

m.def("activate", [](size_t sessionId) {
SessionManager::instance().activateSession(sessionId);
Expand Down
6 changes: 6 additions & 0 deletions third_party/proton/csrc/include/Profiler/GPUProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ class GPUProfiler : public Profiler,
return dynamic_cast<ConcreteProfilerT &>(*this);
}

ConcreteProfilerT &setUtilsCachePath(const std::string &utils_cache_path) {
this->utils_cache_path = utils_cache_path;
return dynamic_cast<ConcreteProfilerT &>(*this);
}

protected:
// OpInterface
void startOp(const Scope &scope) override {
Expand Down Expand Up @@ -139,6 +144,7 @@ class GPUProfiler : public Profiler,
static thread_local ThreadState threadState;
Correlation correlation;
void *syclQueue;
std::string utils_cache_path;

// Use the pimpl idiom to hide the implementation details. This lets us avoid
// including the cupti header from this header. The cupti header and the
Expand Down
14 changes: 7 additions & 7 deletions third_party/proton/csrc/include/Session/Session.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ class SessionManager : public Singleton<SessionManager> {
size_t addSession(const std::string &path, const std::string &profilerName,
const std::string &profilerPath,
const std::string &contextSourceName,
const std::string &dataName, void *sycl_queue);
const std::string &dataName, void *sycl_queue,
const std::string &utils_cache_path);

void finalizeSession(size_t sessionId, OutputFormat outputFormat);

Expand Down Expand Up @@ -106,12 +107,11 @@ class SessionManager : public Singleton<SessionManager> {
void setState(std::optional<Context> context);

private:
std::unique_ptr<Session> makeSession(size_t id, const std::string &path,
const std::string &profilerName,
const std::string &profilerPath,
const std::string &contextSourceName,
const std::string &dataName,
void *sycl_queue);
std::unique_ptr<Session>
makeSession(size_t id, const std::string &path,
const std::string &profilerName, const std::string &profilerPath,
const std::string &contextSourceName, const std::string &dataName,
void *sycl_queue, const std::string &utils_cache_path);

void activateSessionImpl(size_t sessionId);

Expand Down
11 changes: 8 additions & 3 deletions third_party/proton/csrc/lib/Driver/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
add_proton_library(ProtonDriver
set(SOURCE_FILES
Device.cpp
GPU/CudaApi.cpp
GPU/CuptiApi.cpp
GPU/HipApi.cpp
GPU/HsaApi.cpp
GPU/RoctracerApi.cpp
GPU/XpuApi.cpp
GPU/XpuptiApi.cpp
)

if(TRITON_BUILD_PROTON_XPU)
list(APPEND SOURCE_FILES GPU/XpuApi.cpp)
list(APPEND SOURCE_FILES GPU/XpuptiApi.cpp)
endif()

add_proton_library(ProtonDriver ${SOURCE_FILES})
2 changes: 1 addition & 1 deletion third_party/proton/csrc/lib/Profiler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set(SOURCE_FILES
RocTracer/RoctracerProfiler.cpp
)

if(ENABLE_XPUTPI_PROFILER)
if(TRITON_BUILD_PROTON_XPU)
list(APPEND SOURCE_FILES Xpupti/XpuptiProfiler.cpp)
endif()

Expand Down
95 changes: 62 additions & 33 deletions third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@
#include "Utility/Map.h"

#include "pti/pti_view.h"
#include <cassert>
#include <cstring>
#include <level_zero/layers/zel_tracing_api.h>
#include <level_zero/zet_api.h>
#include <sycl/sycl.hpp>

#include <algorithm>
#include <array>
#include <dlfcn.h>

#include <cstdlib>
#include <iostream>
Expand All @@ -30,35 +35,6 @@ namespace {

std::vector<std::array<uint8_t, 16>> deviceUUIDs_ = {};

// FIXME: Should it be in DeviceInfo class?
// Inspired by Kineto: `XpuptiActivityProfiler.cpp`
void enumDeviceUUIDs() {
if (!deviceUUIDs_.empty()) {
return;
}
auto platform_list = sycl::platform::get_platforms();
// Enumerated GPU devices from the specific platform.
for (const auto &platform : platform_list) {
if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
continue;
}
auto device_list = platform.get_devices();
for (const auto &device : device_list) {
if (device.is_gpu()) {
if (device.has(sycl::aspect::ext_intel_device_info_uuid)) {
deviceUUIDs_.push_back(
device.get_info<sycl::ext::intel::info::device::uuid>());
} else {
std::cerr << "Warnings: UUID is not supported for this XPU device. "
"The device index of records will be 0."
<< std::endl;
deviceUUIDs_.push_back(std::array<uint8_t, 16>{});
}
}
}
}
}

uint8_t getDeviceIdxFromUUID(const uint8_t deviceUUID[16]) {
std::array<unsigned char, 16> key;
memcpy(key.data(), deviceUUID, 16);
Expand Down Expand Up @@ -353,9 +329,63 @@ void XpuptiProfiler::XpuptiProfilerPimpl::completeBuffer(uint8_t *buffer,

zel_tracer_handle_t tracer = nullptr;

typedef void (*EnumDeviceUUIDsFunc)(std::vector<std::array<uint8_t, 16>>);

int callEnumDeviceUUIDs(const std::string &utils_cache_path) {
void *handle = dlopen(utils_cache_path.data(), RTLD_LAZY);
if (!handle) {
std::cerr << "Failed to load library: " << dlerror() << std::endl;
return 1;
}

dlerror();
EnumDeviceUUIDsFunc enumDeviceUUIDs =
(EnumDeviceUUIDsFunc)dlsym(handle, "enumDeviceUUIDs");
const char *dlsym_error = dlerror();
if (dlsym_error) {
std::cerr << "Failed to load function: " << dlsym_error << std::endl;
dlclose(handle);
return 1;
}

enumDeviceUUIDs(deviceUUIDs_);

dlclose(handle);
return 0;
}

typedef void (*WaitOnSyclQueueFunc)(void *);

int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) {
void *handle = dlopen(utils_cache_path.data(), RTLD_LAZY);
if (!handle) {
std::cerr << "Failed to load library: " << dlerror() << std::endl;
return 1;
}

dlerror();
WaitOnSyclQueueFunc waitOnSyclQueue =
(WaitOnSyclQueueFunc)dlsym(handle, "waitOnSyclQueue");
const char *dlsym_error = dlerror();
if (dlsym_error) {
std::cerr << "Failed to load function: " << dlsym_error << std::endl;
dlclose(handle);
return 1;
}

waitOnSyclQueue(syclQueue);

dlclose(handle);
return 0;
}

void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
// xpupti::subscribe<true>(&subscriber, callbackFn, nullptr);
enumDeviceUUIDs();
// should be call to shared lib
XpuptiProfiler &profiler = threadState.profiler;
if (profiler.utils_cache_path != "") {
callEnumDeviceUUIDs(profiler.utils_cache_path);
}
// auto res = ptiViewPushExternalCorrelationId(
// pti_view_external_kind::PTI_VIEW_EXTERNAL_KIND_CUSTOM_1, 42);
// std::cout << "res: " << res << "\n" << std::flush;
Expand Down Expand Up @@ -405,8 +435,7 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doFlush() {
std::cout << "flush\n" << std::flush;
XpuptiProfiler &profiler = threadState.profiler;
if (profiler.syclQueue != nullptr) {
sycl::queue *syclQueue = static_cast<sycl::queue *>(profiler.syclQueue);
syclQueue->wait();
callWaitOnSyclQueue(profiler.utils_cache_path, profiler.syclQueue);
}

profiler.correlation.flush(
Expand Down
Loading
Loading