Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
74946b2
Migrate Proton ROCm backend from roctracer to rocprofiler-sdk
Mar 12, 2026
144f7ff
Apply pre-commit formatting fixes (clang-format, yapf)
ZelboK Mar 12, 2026
11b8177
graph destroy clean up, selective hip api subscription(may need audit…
ZelboK Mar 12, 2026
8617215
see if ci fixed
ZelboK Mar 12, 2026
054b2a9
Add roctracer fallback and address comments
ZelboK Mar 17, 2026
364f92c
camelCase format
ZelboK Mar 17, 2026
ba5dd3f
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK Apr 13, 2026
6815167
check if ci passes by resolving inner scopes
ZelboK Apr 13, 2026
80bf33f
simplify runtime loading and tid error
ZelboK Apr 13, 2026
2c8474e
fix tid error not being unique because of hsa queue reuse. fix failin…
ZelboK Apr 13, 2026
e8ad0e5
Merge remote-tracking branch 'origin/main' into feat/rocprofiler_sdk_…
ZelboK Apr 14, 2026
1bab628
bundle required types instead of conditional compile
ZelboK Apr 14, 2026
06440ee
clean up types, remove dead code, and add new header for roctx type
ZelboK Apr 14, 2026
ad6d261
clean up
ZelboK Apr 15, 2026
a3a7c56
test if ci passes 7.12
ZelboK Apr 16, 2026
9c08d44
ci test
ZelboK Apr 16, 2026
5c74b70
delete later doing retry for ci
ZelboK Apr 16, 2026
f46ad32
ci test
ZelboK Apr 16, 2026
4995121
test if ci fixes
ZelboK Apr 16, 2026
d1ad7b5
uncomment changes on job
ZelboK Apr 16, 2026
76e1738
fix mac ci
ZelboK Apr 17, 2026
f54adb3
address comments and lean up ci
ZelboK Apr 24, 2026
35ee378
ci and revert is_active
ZelboK Apr 24, 2026
2b6e384
test ci
ZelboK Apr 29, 2026
a1f6aae
merge
ZelboK Apr 29, 2026
c282886
test ci revert later
ZelboK Apr 30, 2026
2696b97
test to see if scan is still needed and others are needed
ZelboK Apr 30, 2026
cf34496
increase timeout
ZelboK Apr 30, 2026
e5f4ea6
revert comment out ci
ZelboK Apr 30, 2026
7bed3d5
Merge branch 'main' into feat/rocprofiler_sdk_late_start
Jokeren May 1, 2026
65e9ba8
clean up loading lib
ZelboK May 1, 2026
f7b63ce
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 1, 2026
20f4a72
test ci
ZelboK May 1, 2026
fc73352
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 1, 2026
9dc6eab
fix CI to gfx90a default roctracer
ZelboK May 1, 2026
2a395c1
Properly set cdna2 defaults
antiagainst May 1, 2026
b088282
Revert "test ci"
antiagainst May 1, 2026
054a327
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 1, 2026
581fd2a
fix CI for gfx950 and 942
ZelboK May 1, 2026
5ab188f
check if this fixes ci...
ZelboK May 2, 2026
fcb79a4
make roctracer default test for 7.0
ZelboK May 2, 2026
d61bce2
test chatgpt solution for ci.
ZelboK May 2, 2026
1f1e4ab
test for ci
ZelboK May 4, 2026
9bd6f9c
test
ZelboK May 4, 2026
b0e421a
check if loading is prob
ZelboK May 4, 2026
5a04d4d
test if CI passes...
ZelboK May 4, 2026
f240879
hip device ordinal bug
ZelboK May 4, 2026
92d774c
undo skipping parts of CI
ZelboK May 4, 2026
225dfcb
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 4, 2026
8d89c1b
ci failures and merge main
ZelboK May 12, 2026
db221fa
add comment
ZelboK May 12, 2026
5fec788
precommit
ZelboK May 13, 2026
9c35ed3
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 13, 2026
b60e56f
a quick test, refine later to see if ci fixes for amd
ZelboK May 13, 2026
51e95a7
temporarily check if we can repro on nvidia ci with new test(del later?)
ZelboK May 13, 2026
885f7b2
simplify code
ZelboK May 13, 2026
4bf559d
remove test
ZelboK May 13, 2026
73c3479
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 13, 2026
d924790
simplify
ZelboK May 14, 2026
0edf862
Merge branch 'main' into feat/rocprofiler_sdk_late_start
ZelboK May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions .github/workflows/integration-tests-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,115 @@ jobs:
run: |
rm -rf ~/.triton/cache
rm -rf ~/.ccache

proton-tests-amd-rocm712:
  # Builds Triton inside a ROCm 7.12 container on a gfx950 runner and runs the
  # Proton test suite (`make test-proton`). Scheduled after the main AMD
  # integration job; `always()` lets it run regardless of that job's result.
  if: ${{ always() && github.repository == 'triton-lang/triton' }}
  needs: integration-tests-amd
  name: proton-tests-amd (gfx950-rocm712)
  runs-on: ["amd-gfx950"]
  timeout-minutes: 25
  env:
    TRITON_BUILD_WITH_CCACHE: "true"
    TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
    TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
    TRITON_DISABLE_LINE_INFO: 1
    PROTON_SKIP_PC_SAMPLING_TEST: 1
    PYTHON: "python3"
    CCACHE_COMPRESS: "true"
    PIP_BREAK_SYSTEM_PACKAGES: 1
  container:
    image: rocm/vllm:rocm7.12.0_gfx950-dcgpu_ubuntu24.04_py3.12_pytorch_2.9.1_vllm_0.16.0
    options: >-
      --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
      --env-file /etc/podinfo/gha-gpu-isolation-settings
      --volume /home/runner/.triton:/github/home/.triton
      --volume /triton-data:/triton-data
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        submodules: 'true'
    - name: Compute cache keys
      id: cache-key
      run: |
        llvm_file="cmake/llvm-hash.txt"
        nvidia_file="cmake/nvidia-toolchain-version.json"
        json_file="cmake/json-version.txt"

        if [[ ! -f "$llvm_file" || ! -f "$nvidia_file" || ! -f "$json_file" ]]; then
          echo "Error: Required dependency files are missing."
          exit 1
        fi

        # Each output feeds the cache key of the next step.
        echo "llvm=$(cat "$llvm_file" | cut -c 1-8)" >> "$GITHUB_OUTPUT"
        echo "nvidia=$(sha256sum "$nvidia_file" | cut -d ' ' -f 1)" >> "$GITHUB_OUTPUT"
        echo "json=$(cat "$json_file")" >> "$GITHUB_OUTPUT"
      shell: bash
    - name: Cache build dependencies
      uses: actions/cache@v4
      with:
        path: |
          ~/.triton/llvm
          ~/.triton/nvidia
          ~/.triton/json
        key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
    - name: Install dependencies
      run: |
        # Retry the package index refresh up to three times before installing.
        for i in 1 2 3; do
          apt-get -o Acquire::Retries=5 update && break
          echo "apt-get update attempt $i failed, retrying in 10s..."
          sleep 10
        done
        apt-get install -y clang lld ccache
        command -v clang && command -v lld && command -v ccache
    - name: Inspect cache directories
      run: |
        mkdir -p ~/.triton
        du -h -d 1 ~/.triton

        mkdir -p ~/.ccache
        du -h -d 1 ~/.ccache
    - name: Update compiler to Clang
      run: |
        # A plain `export` only lives for this step's shell. Writing to
        # GITHUB_ENV makes CC/CXX visible to the build and test steps below.
        echo "CC=/usr/bin/clang" >> "$GITHUB_ENV"
        echo "CXX=/usr/bin/clang++" >> "$GITHUB_ENV"
    - name: Install Triton
      run: |
        echo "PATH is '$PATH'"
        pip uninstall -y triton pytorch-triton-rocm

        ccache --zero-stats
        pip install --cache-dir /triton-data/pip-cache -r python/requirements.txt
        pip install --cache-dir /triton-data/pip-cache -r python/test-requirements.txt
        make dev-install
    - name: Print ccache stats
      run: ccache --print-stats
    - name: Run Proton tests
      run: |
        unset HIP_VISIBLE_DEVICES
        unset ROCR_VISIBLE_DEVICES
        # Locate the lib directory shipped with the _rocm_sdk_core Python package.
        ROCM_SDK_LIB="$(python3 -c 'import _rocm_sdk_core, os; print(os.path.join(os.path.dirname(_rocm_sdk_core.__file__), "lib"))')"
        echo "ROCM_SDK_LIB=$ROCM_SDK_LIB"
        # Create an unversioned `<base>.so` symlink when only versioned files
        # exist, pointing at the first entry in version-sorted order.
        for base in libamdhip64 librocprofiler-sdk librocprofiler-sdk-attach; do
          if [ ! -e "$ROCM_SDK_LIB/${base}.so" ]; then
            versioned="$(ls "$ROCM_SDK_LIB"/${base}.so.* 2>/dev/null | sort -V | head -1 || true)"
            if [ -n "$versioned" ]; then
              ln -sf "$(basename "$versioned")" "$ROCM_SDK_LIB/${base}.so"
              echo "linked $ROCM_SDK_LIB/${base}.so -> $(basename "$versioned")"
            fi
          fi
        done
        # Append the previous value only when it is non-empty: a trailing ':'
        # would add an empty entry, which the loader treats as the current directory.
        export LD_LIBRARY_PATH="$ROCM_SDK_LIB${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
        make test-proton
    - name: Inspect cache directories
      run: |
        mkdir -p ~/.triton
        du -h -d 1 ~/.triton

        mkdir -p ~/.ccache
        du -h -d 1 ~/.ccache
    - name: Clean up caches
      if: always()
      run: |
        rm -rf ~/.triton/cache
        rm -rf ~/.ccache
8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,10 @@ def get_proton_cmake_args(self):
if cupti_include_dir == "":
cupti_include_dir = os.path.join(get_base_dir(), "third_party", "nvidia", "backend", "include")
cmake_args += ["-DCUPTI_INCLUDE_DIR=" + cupti_include_dir]
roctracer_include_dir = get_env_with_keys(["TRITON_ROCTRACER_INCLUDE_PATH"])
if roctracer_include_dir == "":
roctracer_include_dir = os.path.join(get_base_dir(), "third_party", "amd", "backend", "include")
cmake_args += ["-DROCTRACER_INCLUDE_DIR=" + roctracer_include_dir]
rocm_include_dir = get_env_with_keys(["TRITON_ROCM_INCLUDE_PATH"])
if rocm_include_dir == "":
rocm_include_dir = os.path.join(get_base_dir(), "third_party", "amd", "backend", "include")
cmake_args += ["-DROCM_INCLUDE_DIR=" + rocm_include_dir]
return cmake_args

def build_extension(self, ext):
Expand Down
6 changes: 3 additions & 3 deletions third_party/proton/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ set(PROTON_COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common")
if(NOT CUPTI_INCLUDE_DIR)
message(FATAL_ERROR "CUPTI include directory not defined")
endif()
if(NOT ROCTRACER_INCLUDE_DIR)
message(FATAL_ERROR "ROCTRACER include directory not defined")
if(NOT ROCM_INCLUDE_DIR)
message(FATAL_ERROR "ROCM include directory not defined")
endif()
if(NOT JSON_INCLUDE_DIR)
message(FATAL_ERROR "JSON include directory not defined")
Expand All @@ -30,7 +30,7 @@ function(add_proton_library name)
# Use system to skip warnings caused by legacy clang compilers
target_include_directories(${name}
SYSTEM PRIVATE
"${ROCTRACER_INCLUDE_DIR}"
"${ROCM_INCLUDE_DIR}"
)

target_include_directories(${name}
Expand Down
88 changes: 88 additions & 0 deletions third_party/proton/csrc/include/Driver/GPU/RocprofApi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#ifndef PROTON_DRIVER_GPU_ROCPROFILER_API_H_
#define PROTON_DRIVER_GPU_ROCPROFILER_API_H_

#include "Driver/Dispatch.h"
#include "rocprofiler-sdk/agent.h"
#include "rocprofiler-sdk/buffer.h"
#include "rocprofiler-sdk/buffer_tracing.h"
#include "rocprofiler-sdk/callback_tracing.h"
#include "rocprofiler-sdk/fwd.h"
#include "rocprofiler-sdk/hip/api_args.h"
#include "rocprofiler-sdk/hip/runtime_api_id.h"
#include "rocprofiler-sdk/internal_threading.h"
#include "rocprofiler-sdk/registration.h"

namespace proton {

namespace rocprofiler {

// Static description of the rocprofiler-sdk shared library, consumed by the
// dispatch helpers declared in Driver/Dispatch.h.
// NOTE(review): the per-field roles below are inferred from the field names
// and values; confirm against ExternLibBase / Dispatch.h.
struct ExternLibRocprofiler : ExternLibBase {
  // Status type returned by every rocprofiler-sdk entry point wrapped here.
  using RetType = rocprofiler_status_t;

  // Library handle slot; starts out null.
  static inline void *lib = nullptr;

  // Status value that denotes a successful call.
  static constexpr RetType success = ROCPROFILER_STATUS_SUCCESS;

  // Presumably the environment variable that overrides the library location.
  static constexpr const char *pathEnv = "TRITON_ROCPROFILER_SDK_LIB_PATH";

  // Presumably a symbol used to recognise an already-loaded library.
  static constexpr const char *symbolName = "rocprofiler_is_initialized";

  // Shared object file name of the rocprofiler-sdk runtime.
  static constexpr const char *name = "librocprofiler-sdk.so";
};

// Wrappers over the rocprofiler-sdk C API. Each template declared here is
// defined in RocprofApi.cpp via DEFINE_DISPATCH, which pairs it with the
// `rocprofiler_*` symbol named in the comment above the declaration. Every
// wrapper returns the SDK's rocprofiler_status_t.
// NOTE(review): CheckSuccess presumably selects whether the dispatch layer
// treats a non-success status as an error; confirm in Driver/Dispatch.h.

// Wraps rocprofiler_is_initialized.
template <bool CheckSuccess> rocprofiler_status_t isInitialized(int *status);

// Wraps rocprofiler_force_configure.
template <bool CheckSuccess>
rocprofiler_status_t forceConfigure(rocprofiler_configure_func_t configureFunc);

// Wraps rocprofiler_create_context.
template <bool CheckSuccess>
rocprofiler_status_t createContext(rocprofiler_context_id_t *context);

// Wraps rocprofiler_destroy_context.
template <bool CheckSuccess>
rocprofiler_status_t destroyContext(rocprofiler_context_id_t context);

// Wraps rocprofiler_start_context.
template <bool CheckSuccess>
rocprofiler_status_t startContext(rocprofiler_context_id_t context);

// Wraps rocprofiler_stop_context.
template <bool CheckSuccess>
rocprofiler_status_t stopContext(rocprofiler_context_id_t context);

// Wraps rocprofiler_create_buffer; the new buffer id is written to `buffer`.
template <bool CheckSuccess>
rocprofiler_status_t
createBuffer(rocprofiler_context_id_t context, size_t size, size_t watermark,
rocprofiler_buffer_policy_t policy,
rocprofiler_buffer_tracing_cb_t callback, void *userData,
rocprofiler_buffer_id_t *buffer);

// Wraps rocprofiler_destroy_buffer.
template <bool CheckSuccess>
rocprofiler_status_t destroyBuffer(rocprofiler_buffer_id_t buffer);

// Wraps rocprofiler_flush_buffer.
template <bool CheckSuccess>
rocprofiler_status_t flushBuffer(rocprofiler_buffer_id_t buffer);

// Wraps rocprofiler_configure_buffer_tracing_service.
template <bool CheckSuccess>
rocprofiler_status_t configureBufferTracingService(
rocprofiler_context_id_t context, rocprofiler_buffer_tracing_kind_t kind,
const rocprofiler_tracing_operation_t *operations, size_t operationCount,
rocprofiler_buffer_id_t buffer);

// Wraps rocprofiler_configure_callback_tracing_service.
template <bool CheckSuccess>
rocprofiler_status_t configureCallbackTracingService(
rocprofiler_context_id_t context, rocprofiler_callback_tracing_kind_t kind,
const rocprofiler_tracing_operation_t *operations, size_t operationCount,
rocprofiler_callback_tracing_cb_t callback, void *userData);

// Wraps rocprofiler_create_callback_thread.
template <bool CheckSuccess>
rocprofiler_status_t
createCallbackThread(rocprofiler_callback_thread_t *thread);

// Wraps rocprofiler_assign_callback_thread.
template <bool CheckSuccess>
rocprofiler_status_t assignCallbackThread(rocprofiler_buffer_id_t buffer,
rocprofiler_callback_thread_t thread);

// Wraps rocprofiler_query_available_agents.
template <bool CheckSuccess>
rocprofiler_status_t
queryAvailableAgents(rocprofiler_agent_version_t version,
rocprofiler_query_available_agents_cb_t callback,
size_t agentSize, void *userData);

} // namespace rocprofiler

} // namespace proton

#endif // PROTON_DRIVER_GPU_ROCPROFILER_API_H_
5 changes: 1 addition & 4 deletions third_party/proton/csrc/include/Profiler/GPUProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ namespace detail {

void flushDataPhasesImpl(
const bool periodicFlushEnabled, const std::string &periodicFlushingFormat,
std::map<Data *, size_t> &dataFlushedPhases,
const std::map<Data *,
std::pair</*start_phase=*/size_t, /*end_phase=*/size_t>>
&dataPhases,
Expand Down Expand Up @@ -93,14 +92,12 @@ class GPUProfiler : public Profiler,
}

void flushDataPhases(
std::map<Data *, size_t> &dataFlushedPhases,
const std::map<Data *,
std::pair</*start_phase=*/size_t, /*end_phase=*/size_t>>
&dataPhases,
PendingGraphPool *pendingGraphPool) {
detail::flushDataPhasesImpl(periodicFlushingEnabled, periodicFlushingFormat,
dataFlushedPhases, dataPhases,
pendingGraphPool);
dataPhases, pendingGraphPool);
}

// Profiler
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef PROTON_PROFILER_ROCPROFSDK_PROFILER_H_
#define PROTON_PROFILER_ROCPROFSDK_PROFILER_H_

#include "Profiler/GPUProfiler.h"

namespace proton {

// Profiler backend for rocprofiler-sdk. Derives from the GPUProfiler CRTP
// base, passing itself as the template argument.
class RocprofSDKProfiler : public GPUProfiler<RocprofSDKProfiler> {
public:
  // Forward declaration only; the definition is not part of this header.
  struct RocprofSDKProfilerPimpl;

  RocprofSDKProfiler();
  virtual ~RocprofSDKProfiler();

private:
  // Overrides the base-class hook that receives the requested mode and its
  // options as a list of strings.
  void doSetMode(const std::vector<std::string> &modeAndOptions) override;
};

} // namespace proton

#endif // PROTON_PROFILER_ROCPROFSDK_PROFILER_H_
3 changes: 2 additions & 1 deletion third_party/proton/csrc/lib/Driver/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ add_proton_library(ProtonDriver
GPU/CuptiApi.cpp
GPU/HipApi.cpp
GPU/HsaApi.cpp
GPU/RoctracerApi.cpp
GPU/NvtxApi.cpp
GPU/RoctracerApi.cpp
GPU/RocprofApi.cpp
)
60 changes: 60 additions & 0 deletions third_party/proton/csrc/lib/Driver/GPU/RocprofApi.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include "Driver/GPU/RocprofApi.h"

namespace proton {
namespace rocprofiler {

// One DEFINE_DISPATCH per wrapper declared in Driver/GPU/RocprofApi.h.
// Arguments, in order: the library traits type, the wrapper name, the
// rocprofiler-sdk symbol it is paired with, then the parameter types.
// NOTE(review): the macro is defined in Driver/Dispatch.h (not visible here);
// it presumably resolves the symbol from the loaded library on use — confirm.

// SDK initialization state and forced configuration.
DEFINE_DISPATCH(ExternLibRocprofiler, isInitialized, rocprofiler_is_initialized,
int *)

DEFINE_DISPATCH(ExternLibRocprofiler, forceConfigure,
rocprofiler_force_configure, rocprofiler_configure_func_t)

// Context lifecycle: create, destroy, start, stop.
DEFINE_DISPATCH(ExternLibRocprofiler, createContext, rocprofiler_create_context,
rocprofiler_context_id_t *)

DEFINE_DISPATCH(ExternLibRocprofiler, destroyContext,
rocprofiler_destroy_context, rocprofiler_context_id_t)

DEFINE_DISPATCH(ExternLibRocprofiler, startContext, rocprofiler_start_context,
rocprofiler_context_id_t)

DEFINE_DISPATCH(ExternLibRocprofiler, stopContext, rocprofiler_stop_context,
rocprofiler_context_id_t)

// Buffer lifecycle: create, destroy, flush.
DEFINE_DISPATCH(ExternLibRocprofiler, createBuffer, rocprofiler_create_buffer,
rocprofiler_context_id_t, size_t, size_t,
rocprofiler_buffer_policy_t, rocprofiler_buffer_tracing_cb_t,
void *, rocprofiler_buffer_id_t *)

DEFINE_DISPATCH(ExternLibRocprofiler, destroyBuffer, rocprofiler_destroy_buffer,
rocprofiler_buffer_id_t)

DEFINE_DISPATCH(ExternLibRocprofiler, flushBuffer, rocprofiler_flush_buffer,
rocprofiler_buffer_id_t)

// Tracing service configuration (buffered and callback variants).
DEFINE_DISPATCH(ExternLibRocprofiler, configureBufferTracingService,
rocprofiler_configure_buffer_tracing_service,
rocprofiler_context_id_t, rocprofiler_buffer_tracing_kind_t,
const rocprofiler_tracing_operation_t *, size_t,
rocprofiler_buffer_id_t)

DEFINE_DISPATCH(ExternLibRocprofiler, configureCallbackTracingService,
rocprofiler_configure_callback_tracing_service,
rocprofiler_context_id_t, rocprofiler_callback_tracing_kind_t,
const rocprofiler_tracing_operation_t *, size_t,
rocprofiler_callback_tracing_cb_t, void *)

// Callback thread creation and assignment to a buffer.
DEFINE_DISPATCH(ExternLibRocprofiler, createCallbackThread,
rocprofiler_create_callback_thread,
rocprofiler_callback_thread_t *)

DEFINE_DISPATCH(ExternLibRocprofiler, assignCallbackThread,
rocprofiler_assign_callback_thread, rocprofiler_buffer_id_t,
rocprofiler_callback_thread_t)

// Agent enumeration.
DEFINE_DISPATCH(ExternLibRocprofiler, queryAvailableAgents,
rocprofiler_query_available_agents, rocprofiler_agent_version_t,
rocprofiler_query_available_agents_cb_t, size_t, void *)

} // namespace rocprofiler
} // namespace proton
1 change: 1 addition & 0 deletions third_party/proton/csrc/lib/Profiler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ add_proton_library(ProtonProfiler
Cupti/CuptiPCSampling.cpp
Cupti/CuptiProfiler.cpp
RocTracer/RoctracerProfiler.cpp
RocprofSDK/RocprofSDKProfiler.cpp
Instrumentation/InstrumentationProfiler.cpp
Instrumentation/Metadata.cpp
)
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,6 @@ void CuptiProfiler::CuptiProfilerPimpl::completeBuffer(CUcontext ctx,
size_t validSize) {
CuptiProfiler &profiler = threadState.profiler;
uint32_t maxCorrelationId = 0;
static thread_local std::map<Data *, size_t> dataFlushedPhases;
std::map<Data *, std::pair<size_t, size_t>> dataPhases;
CUptiResult status;
CUpti_Activity *activity = nullptr;
Expand All @@ -456,8 +455,7 @@ void CuptiProfiler::CuptiProfilerPimpl::completeBuffer(CUcontext ctx,
std::free(buffer);

profiler.correlation.complete(maxCorrelationId);
profiler.flushDataPhases(dataFlushedPhases, dataPhases,
profiler.pendingGraphPool.get());
profiler.flushDataPhases(dataPhases, profiler.pendingGraphPool.get());
}

void CuptiProfiler::CuptiProfilerPimpl::handleGraphResourceCallbacks(
Expand Down
Loading
Loading