Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
996c2e1
Build DeviceRTL with spirv backend
fineg74 Nov 25, 2025
5e746a8
Merge branch 'main' into l0RTL
fineg74 Dec 7, 2025
3587050
Merge remote-tracking branch 'origin/main' into l0RTL
fineg74 Jan 6, 2026
84def59
Address PR comments
fineg74 Jan 7, 2026
5b6f225
Fix formatting issues
fineg74 Jan 7, 2026
dc78bad
Address PR comments
fineg74 Jan 8, 2026
e98cf56
Fix formatting
fineg74 Jan 9, 2026
40dfa28
Allow build of DeviceRTL for SPIRV backend
fineg74 Jan 9, 2026
2922a82
Fix format
fineg74 Jan 9, 2026
a18c339
Fix formatting
fineg74 Jan 9, 2026
ae099ba
Fix format
fineg74 Jan 9, 2026
db04823
Merge remote-tracking branch 'origin/main' into l0RTL
fineg74 Jan 9, 2026
802a65e
Add TODO comment
fineg74 Jan 9, 2026
f2dbfa2
Fix format
fineg74 Jan 9, 2026
101fccd
Fix test failure
fineg74 Jan 9, 2026
7584cad
Merge remote-tracking branch 'origin/main' into l0RTL
fineg74 Jan 9, 2026
a660034
Reenable attributes after backend fixes
fineg74 Jan 9, 2026
b79e42a
Address PR comments
fineg74 Jan 9, 2026
488209c
Address PR comments
fineg74 Jan 10, 2026
c5ac596
Addressed PR comments
fineg74 Jan 10, 2026
6366e0c
Address PR comments
fineg74 Jan 10, 2026
e090887
Update openmp/device/CMakeLists.txt
fineg74 Jan 10, 2026
d50579f
Merge remote-tracking branch 'origin/main' into l0RTL
fineg74 Jan 19, 2026
7cc3270
Update the branch with support for variadics
fineg74 Jan 19, 2026
af38777
Remove spurious changes
fineg74 Jan 19, 2026
4bd07fd
Applied different work around for a build break
fineg74 Jan 19, 2026
4ad7138
Fix formatting
fineg74 Jan 19, 2026
e66a4e4
Address PR comments
fineg74 Jan 21, 2026
867d96a
Merge remote-tracking branch 'origin/main' into l0RTL
fineg74 Jan 23, 2026
f92b77e
Address PR comments
fineg74 Jan 24, 2026
7e5e695
Fix formatting
fineg74 Jan 24, 2026
bb719be
Use c++ style attributes
fineg74 Jan 24, 2026
b9875fd
Merge remote-tracking branch 'origin/main' into l0RTL
fineg74 Jan 26, 2026
e2cc882
Cleanup based on trunk rebase
fineg74 Jan 26, 2026
5e20646
Fix formatting
fineg74 Jan 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions openmp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ else()
endif()

# Use the current compiler target to determine the appropriate runtime to build.
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx|^spirv64" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx|^spirv64")
add_subdirectory(device)
else()
add_subdirectory(module)
Expand Down
15 changes: 13 additions & 2 deletions openmp/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ list(APPEND compile_options -fno-rtti)
list(APPEND compile_options -fno-exceptions)
list(APPEND compile_options -fconvergent-functions)
list(APPEND compile_options -Wno-unknown-cuda-version)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unnecessary reordering

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

# When a default target triple is configured, compile the device code for it.
if(LLVM_DEFAULT_TARGET_TRIPLE)
list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
endif()
Expand All @@ -52,6 +53,9 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
set(target_name "nvptx")
list(APPEND compile_options --cuda-feature=+ptx63)
elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
set(target_name "spirv")
endif()

# Trick to combine these into a bitcode file via the linker's LTO pass.
Expand All @@ -74,8 +78,15 @@ target_include_directories(libompdevice PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../../libc
${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
target_compile_options(libompdevice PRIVATE ${compile_options})
target_link_options(libompdevice PRIVATE
"-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
# Pick the link flags for libompdevice. SPIR-V targets are linked without LTO
# because LTO is not hooked up for the SPIR-V link step yet; every other
# target combines the objects into a bitcode file through the linker's LTO
# pass.
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
  target_link_options(libompdevice PRIVATE "-nostdlib" "-emit-llvm")
else()
  target_link_options(libompdevice PRIVATE
                      "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
endif()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why aren't we building the device RTL in LTO, as we do for the rest?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The SPIR-V backend doesn't support LTO.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, what is the issue? We rely on LTO for performance, and without it I doubt we can be competitive.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sure the library will still get linked in and optimized somehow, but the library as a whole never gets its own combined optimization step. That definitely hurts performance, but the individual files are still optimized, so it's likely not catastrophic.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that SPIR-V currently uses an out-of-tree linker, spirv-link, and nobody has hooked up the driver to call lld instead of spirv-link when LTO is enabled. I will try to tackle that after I finish the couple of tasks I have now, but right now LTO isn't hooked up.


# When a default target triple is configured, link libompdevice for it.
if(LLVM_DEFAULT_TARGET_TRIPLE)
target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
endif()
Expand Down
12 changes: 11 additions & 1 deletion openmp/device/include/DeviceTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,17 @@ struct IdentTy {

using __kmpc_impl_lanemask_t = LaneMaskTy;

using ParallelRegionFnTy = void *;
#ifdef __SPIRV__
// FnPtrTy is the type the runtime uses to carry function pointers that are
// passed around as untyped pointers.
// The SPIR-V backend places function pointers in a dedicated address space
// (9). Because they are passed as regular void pointers, the alias carries
// that address space so that no address-space casts are needed; such casts
// would otherwise produce errors during compilation.
using FnPtrTy = void [[clang::address_space(9)]] *;
#else
// All other targets pass function pointers as ordinary void pointers.
using FnPtrTy = void *;
#endif

using ParallelRegionFnTy = FnPtrTy;

using CriticalNameTy = int32_t[8];

Expand Down
2 changes: 1 addition & 1 deletion openmp/device/include/State.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
__builtin_unreachable();
}

[[gnu::always_inline, gnu::flatten]] inline void *&
[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
switch (Kind) {
case state::VK_ParallelRegionFn:
Expand Down
13 changes: 13 additions & 0 deletions openmp/device/include/Synchronization.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,20 @@ enum MemScopeTy {
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
      MemScopeTy MemScope = MemScopeTy::device) {
#if defined(__SPIRV__)
  // On SPIR-V the wrapping increment is emulated with a load +
  // compare-and-swap retry loop: the stored value becomes 0 when the old value
  // is >= Val and old + 1 otherwise; the old value is returned.
  //
  // The loop works on the pointee type instead of a hard-coded uint32_t so
  // that wider counters are not truncated.
  //
  // NOTE(review): Ordering is forwarded unchanged to the load and as the CAS
  // failure ordering; confirm that callers only pass orderings that are valid
  // in those positions.
  using ElemTy = utils::remove_addrspace_t<Ty>;
  ElemTy Limit = static_cast<ElemTy>(Val);
  while (true) {
    ElemTy Old = load(Address, Ordering, MemScope);
    ElemTy New = Old >= Limit ? static_cast<ElemTy>(0)
                              : static_cast<ElemTy>(Old + 1);
    // Retry if another thread changed the value between the load and the CAS.
    if (cas(Address, Old, New, Ordering, Ordering, MemScope))
      return static_cast<V>(Old);
  }
#else
  // Other targets use the compiler builtin directly.
  return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
#endif
}

template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
Expand Down
2 changes: 1 addition & 1 deletion openmp/device/src/Allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using namespace allocator;
// Provide a default implementation of malloc / free for AMDGPU platforms built
// without 'libc' support.
extern "C" {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else
Expand Down
10 changes: 5 additions & 5 deletions openmp/device/src/Parallelism.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {

// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
int32_t bound_tid, void *fn,
int32_t bound_tid, FnPtrTy fn,
void **args, int64_t nargs) {
switch (nargs) {
#include "generated_microtask_cases.gen"
Expand All @@ -84,7 +84,7 @@ extern "C" {

[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
int32_t num_threads,
void *fn, void **args,
FnPtrTy fn, void **args,
const int64_t nargs) {
uint32_t TId = mapping::getThreadIdInBlock();
uint32_t NumThreads = determineNumberOfThreads(num_threads);
Expand Down Expand Up @@ -142,8 +142,8 @@ extern "C" {

[[clang::always_inline]] void
__kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
int32_t num_threads, int proc_bind, void *fn,
void *wrapper_fn, void **args, int64_t nargs,
int32_t num_threads, int proc_bind, FnPtrTy fn,
FnPtrTy wrapper_fn, void **args, int64_t nargs,
int32_t nt_strict) {
uint32_t TId = mapping::getThreadIdInBlock();

Expand Down Expand Up @@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
1u, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
(void *)nullptr, true, ident,
(FnPtrTy) nullptr, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
/*ForceTeamState=*/true);
Expand Down
31 changes: 31 additions & 0 deletions openmp/device/src/Synchronization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,37 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
#endif
///}

#if defined(__SPIRV__)
/// Named-barrier initialization is not implemented in this SPIR-V section;
/// calling it traps.
void namedBarrierInit() {
  __builtin_trap(); // TODO
}
/// The named barrier is not implemented in this SPIR-V section; calling it
/// traps.
void namedBarrier() {
  __builtin_trap(); // TODO
}

/// Release \p Lock by atomically storing 0 to it, viewed as an int32_t, with
/// sequentially consistent ordering.
void unsetLock(omp_lock_t *Lock) {
  int32_t *LockWord = (int32_t *)Lock;
  atomic::store(LockWord, 0, atomic::seq_cst);
}
/// Try to acquire \p Lock without blocking.
///
/// \returns non-zero if this call changed the lock word from 0 (released) to
/// 1 (held), and 0 if the lock was already held.
int testLock(omp_lock_t *Lock) {
  // A single compare-and-swap both tests and takes the lock. Reading the lock
  // word with an atomic add of 0 never acquired the lock and reported
  // "success" exactly when the lock was already held.
  int32_t Unlocked = 0;
  return atomic::cas((int32_t *)Lock, Unlocked, 1, atomic::seq_cst,
                     atomic::seq_cst);
}
/// Initialize \p Lock by putting it into the released state via unsetLock.
void initLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
/// Destroy \p Lock; this simply resets it to the released state via
/// unsetLock.
void destroyLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
/// Acquire \p Lock, spinning until it could be taken.
///
/// The lock is treated as an int32_t that is 0 when released and 1 when held.
void setLock(omp_lock_t *Lock) {
  int32_t *LockWord = (int32_t *)Lock;
  for (;;) {
    int32_t Unlocked = 0;
    // Only attempt the compare-and-swap once the lock is observed as
    // released.
    if (atomic::load(LockWord, atomic::seq_cst) != Unlocked)
      continue;
    if (atomic::cas(LockWord, Unlocked, 1, atomic::seq_cst, atomic::seq_cst))
      return;
  }
}

/// Release the lock guarding a critical region; forwards to unsetLock.
void unsetCriticalLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
/// Acquire the lock guarding a critical region; forwards to setLock.
void setCriticalLock(omp_lock_t *Lock) {
  setLock(Lock);
}
/// Aligned thread synchronization for SPIR-V: forwards to
/// synchronize::threads with the same \p Ordering.
void syncThreadsAligned(atomic::OrderingTy Ordering) {
synchronize::threads(Ordering);
}
#endif

} // namespace impl

void synchronize::init(bool IsSPMD) {
Expand Down
Loading