diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index 8db34232502f7..337e68a15776d 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -199,8 +199,8 @@ else()
 endif()
 
 # Use the current compiler target to determine the appropriate runtime to build.
-if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
-   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx|^spirv64" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx|^spirv64")
   add_subdirectory(device)
 else()
   add_subdirectory(module)
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 5722cea1bf9e2..0e0507b3d2103 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -87,7 +87,10 @@ if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
 elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
        "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
   set(target_name "nvptx")
-  list(APPEND compile_flags --cuda-feature=+ptx63)
+  list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+  set(target_name "spirv")
 endif()
 
 # Trick to combine these into a bitcode file via the linker's LTO pass.
@@ -110,8 +113,15 @@ target_include_directories(libompdevice PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
     ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
 target_compile_options(libompdevice PRIVATE ${compile_options} ${compile_flags})
-target_link_options(libompdevice PRIVATE
-                    "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+   NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+  target_link_options(libompdevice PRIVATE
+                      "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+else()
+  target_link_options(libompdevice PRIVATE
+                      "-nostdlib" "-emit-llvm")
+endif()
+
 if(LLVM_DEFAULT_TARGET_TRIPLE)
   target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
 endif()
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index f39c6d7f65702..56cd7a9a92eb4 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -132,7 +132,17 @@ struct IdentTy {
 
 using __kmpc_impl_lanemask_t = LaneMaskTy;
 
-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+// Function pointers in the SPIR-V backend have a special address space (9).
+// Since function pointers are passed as regular void * pointers, it is
+// necessary to annotate them with the proper address space to avoid casting
+// errors during compilation.
+using FnPtrTy = void [[clang::address_space(9)]] *;
+#else
+using FnPtrTy = void *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;
 
 using CriticalNameTy = int32_t[8];
 
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index b8a0da538d466..0273089d09261 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -220,7 +220,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
   __builtin_unreachable();
 }
 
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
 lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
   switch (Kind) {
   case state::VK_ParallelRegionFn:
diff --git a/openmp/device/include/Synchronization.h b/openmp/device/include/Synchronization.h
index d72f0c8a1696c..32c33cb4bf246 100644
--- a/openmp/device/include/Synchronization.h
+++ b/openmp/device/include/Synchronization.h
@@ -42,7 +42,20 @@ enum MemScopeTy {
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
       MemScopeTy MemScope = MemScopeTy::device) {
+#if defined(__SPIRV__)
+  uint32_t Old;
+  while (true) {
+    Old = load(Address, Ordering, MemScope);
+    if (Old >= Val) {
+      if (cas(Address, Old, 0u, Ordering, Ordering, MemScope))
+        break;
+    } else if (cas(Address, Old, Old + 1, Ordering, Ordering, MemScope))
+      break;
+  }
+  return Old;
+#else
   return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
+#endif
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
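
Note on the SPIR-V branch added to inc() above: it rebuilds the wrap-around increment that __scoped_atomic_fetch_uinc provides on the other targets, that is, return the old value, store 0 if the old value was already >= Val, otherwise store old + 1, retrying through CAS until the update lands. The following standalone host-side sketch shows the same loop with std::atomic; it is illustrative only, and the name atomicIncWrap and the acquire/acq_rel orderings are assumptions, not part of the patch.

// Host-side illustration (not part of the runtime): emulate the "unsigned
// increment with wraparound" atomic with a compare-and-swap loop, mirroring
// the SPIR-V fallback added to inc() in Synchronization.h.
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Returns the previous counter value; stores 0 if that value was >= Bound,
// otherwise stores the previous value + 1.
static uint32_t atomicIncWrap(std::atomic<uint32_t> &Counter, uint32_t Bound) {
  uint32_t Old = Counter.load(std::memory_order_acquire);
  while (true) {
    uint32_t New = (Old >= Bound) ? 0u : Old + 1u;
    // On failure, compare_exchange_weak refreshes Old with the current value,
    // so the loop simply recomputes New and retries.
    if (Counter.compare_exchange_weak(Old, New, std::memory_order_acq_rel,
                                      std::memory_order_acquire))
      return Old;
  }
}

int main() {
  std::atomic<uint32_t> Counter{0};
  std::vector<std::thread> Threads;
  for (int I = 0; I < 8; ++I)
    Threads.emplace_back([&] {
      for (int J = 0; J < 1000; ++J)
        atomicIncWrap(Counter, /*Bound=*/7);
    });
  for (std::thread &T : Threads)
    T.join();
  std::printf("final counter value: %u\n", Counter.load()); // always in [0, 7]
  return 0;
}

Unlike the sketch, the runtime version goes through the DeviceRTL atomic::load/atomic::cas wrappers with the caller-supplied Ordering and MemScope, and it fixes the element type to uint32_t rather than using the templated type.
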
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 24f989197a707..fdac0de32da3a 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -44,7 +44,7 @@ extern "C" {
 }
 #endif
 
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
 [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
 [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
 #else
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index bd2c0799ee9f0..9f74990ce43ea 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
 
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 [[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
+                                              int32_t bound_tid, FnPtrTy fn,
                                               void **args, int64_t nargs) {
   switch (nargs) {
 #include "generated_microtask_cases.gen"
@@ -84,7 +84,7 @@ extern "C" {
 
 [[clang::always_inline]] void
 __kmpc_parallel_spmd(IdentTy *ident, int32_t num_threads,
-                     void *fn, void **args,
+                     FnPtrTy fn, void **args,
                      const int64_t nargs) {
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
@@ -142,8 +142,8 @@ extern "C" {
 
 [[clang::always_inline]] void
 __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs,
+                   int32_t num_threads, int proc_bind, FnPtrTy fn,
+                   FnPtrTy wrapper_fn, void **args, int64_t nargs,
                    int32_t nt_strict) {
 
   uint32_t TId = mapping::getThreadIdInBlock();
@@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
                                  1u, true, ident,
                                  /*ForceTeamState=*/true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
+                                          (FnPtrTy)nullptr, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                      /*ForceTeamState=*/true);
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index af522bf66b35a..88de9442a558c 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -232,6 +232,37 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 #endif
 ///}
 
+#if defined(__SPIRV__)
+void namedBarrierInit() { __builtin_trap(); } // TODO
+void namedBarrier() { __builtin_trap(); } // TODO
+
+void unsetLock(omp_lock_t *Lock) {
+  atomic::store((int32_t *)Lock, 0, atomic::seq_cst);
+}
+int testLock(omp_lock_t *Lock) {
+  return atomic::add((int32_t *)Lock, 0, atomic::seq_cst);
+}
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void setLock(omp_lock_t *Lock) {
+  int32_t *LockPtr = (int32_t *)Lock;
+  bool Acquired = false;
+  int32_t Expected;
+  while (!Acquired) {
+    Expected = 0;
+    if (Expected == atomic::load(LockPtr, atomic::seq_cst))
+      Acquired =
+          atomic::cas(LockPtr, Expected, 1, atomic::seq_cst, atomic::seq_cst);
+  }
+}
+
+void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
+void syncThreadsAligned(atomic::OrderingTy Ordering) {
+  synchronize::threads(Ordering);
+}
+#endif
+
 } // namespace impl
 
 void synchronize::init(bool IsSPMD) {
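
Note on the SPIR-V lock routines added to Synchronization.cpp: they treat omp_lock_t as a 32-bit word where 0 means unlocked and 1 means locked, setLock spins on a plain load and only attempts a CAS after observing 0, and unsetLock is a plain store of 0, all with seq_cst ordering. Below is a minimal host-side sketch of that test-and-test-and-set pattern, assuming std::atomic in place of the runtime's atomic:: helpers; SpinLock and its members are illustrative names, and the weaker acquire/release orderings differ from the seq_cst used in the patch.

// Host-side illustration (not part of the runtime): a test-and-test-and-set
// spin lock with the same shape as the SPIR-V setLock/unsetLock pair.
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

struct SpinLock {
  std::atomic<int32_t> State{0}; // 0 = unlocked, 1 = locked

  void lock() {
    for (;;) {
      // Spin on a plain load first so waiting threads do not hammer the cache
      // line with failed CAS attempts; only try the CAS after observing 0.
      if (State.load(std::memory_order_relaxed) == 0) {
        int32_t Expected = 0;
        if (State.compare_exchange_weak(Expected, 1, std::memory_order_acquire,
                                        std::memory_order_relaxed))
          return;
      }
    }
  }

  void unlock() { State.store(0, std::memory_order_release); }
};

int main() {
  SpinLock Lock;
  long Counter = 0;
  std::vector<std::thread> Threads;
  for (int I = 0; I < 4; ++I)
    Threads.emplace_back([&] {
      for (int J = 0; J < 100000; ++J) {
        Lock.lock();
        ++Counter; // serialized by the lock
        Lock.unlock();
      }
    });
  for (std::thread &T : Threads)
    T.join();
  std::printf("counter = %ld (expected %d)\n", Counter, 4 * 100000);
  return 0;
}

In the patch itself, testLock reads the word via atomic::add of 0, so a nonzero result reports the lock as held, and the namedBarrier variants remain stubbed out with __builtin_trap, as the TODO comments note.
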