[OFFLOAD] Build DeviceRTL with SPIRV backend #174675
Conversation
@llvm/pr-subscribers-backend-x86

Author: None (fineg74)

Changes: This PR adds configuration to build DeviceRTL with the SPIRV backend. It is primarily used for the level-zero plugin for Intel GPUs.

Patch is 22.48 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/174675.diff

11 Files Affected:
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
gpuintrin.h
nvptxintrin.h
amdgpuintrin.h
+ spirvintrin.h
)
set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..8b75cc14878e3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -18,7 +18,7 @@
#define __GPUINTRIN_H
#if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__HIP__) || defined(__CUDA__) || defined(__SPIRV__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
@@ -62,6 +62,8 @@ _Pragma("omp end declare target");
#include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif
_Pragma("omp begin declare target device_type(nohost)");
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..e66a2bf0767a6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,235 @@
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIRV targets or offloading to SPIRV"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+#include <stdint.h>
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#define true 1
+#define false 0
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+//
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((spirv_kernel, visibility("protected")))
+#define __SPIRV_VAR_QUALIFIERS extern const
+// Workgroup and invocation ID functions
+uint64_t __spirv_BuiltInNumWorkgroups(int i);
+uint64_t __spirv_BuiltInWorkgroupId(int i);
+uint64_t __spirv_BuiltInWorkgroupSize(int i);
+uint64_t __spirv_BuiltInLocalInvocationId(int i);
+
+typedef enum {
+ CrossDevice = 0,
+ Device = 1,
+ Workgroup = 2,
+ Subgroup = 3,
+ Invocation = 4
+} Scope_t;
+
+typedef enum {
+ Relaxed = 0x0,
+ Acquire = 0x2,
+ Release = 0x4,
+ AcquireRelease = 0x8,
+ SequentiallyConsistent = 0x10
+} MemorySemantics_t;
+
+const uint32_t ProgramAS = 9;
+
+#ifdef __cplusplus
+template <typename... Args> int __spirv_ocl_printf(Args...);
+#endif
+
+// Subgroup
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
+
+// Group non-uniform operations
+uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope,
+ bool predicate);
+uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope,
+ uint32_t value);
+uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope,
+ uint32_t value, uint32_t id);
+
+// Synchronization
+void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope,
+ uint32_t semantics);
+void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
+
+// Atomic
+uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
+void __spirv_AtomicStore(int32_t *, int, int, int);
+int32_t __spirv_AtomicLoad(int32_t *, int, int);
+int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
+
+
+// Returns the number of blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+ return __spirv_BuiltInNumWorkgroups(0);
+}
+
+// Returns the number of blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+ return __spirv_BuiltInNumWorkgroups(1);
+}
+
+// Returns the number of blocks in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+ return __spirv_BuiltInNumWorkgroups(2);
+}
+
+// Returns the 'x' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+ return __spirv_BuiltInWorkgroupId(0);
+}
+
+// Returns the 'y' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+ return __spirv_BuiltInWorkgroupId(1);
+}
+
+// Returns the 'z' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+ return __spirv_BuiltInWorkgroupId(2);
+}
+
+// Returns the number of threads in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+ return __spirv_BuiltInWorkgroupSize(0);
+}
+
+// Returns the number of threads in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+ return __spirv_BuiltInWorkgroupSize(1);
+}
+
+// Returns the number of threads in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+ return __spirv_BuiltInWorkgroupSize(2);
+}
+
+// Returns the 'x' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+ return __spirv_BuiltInLocalInvocationId(0);
+}
+
+// Returns the 'y' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+ return __spirv_BuiltInLocalInvocationId(1);
+}
+
+// Returns the 'z' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+ return __spirv_BuiltInLocalInvocationId(2);
+}
+
+// Returns the number of lanes in a subgroup (the warp size on this target).
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+ return __spirv_BuiltInSubgroupSize;
+}
+
+// Returns the id of the thread inside of a warp executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+ return __spirv_BuiltInSubgroupLocalInvocationId;
+}
+
+// Returns the bit-mask of active threads in the current warp. Note: this
+// assumes the full subgroup is active and builds a mask of that many bits.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  uint32_t Size = __gpu_num_lanes();
+  return ((uint64_t)1 << Size) - (uint64_t)1;
+}
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __spirv_GroupNonUniformBroadcastFirst(/*Subgroup scope*/ 3, __x);
+}
+// Returns a bitmask of the threads in the current warp for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  uint64_t ballot = __spirv_GroupNonUniformBallot(/*Subgroup scope*/ 3, __x);
+  return __lane_mask & ballot;
+}
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+                         /*WorkgroupMemory*/ 0x100 |
+                             MemorySemantics_t::SequentiallyConsistent);
+}
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+                         /*SubgroupMemory*/ 0x80 |
+                             MemorySemantics_t::SequentiallyConsistent);
+}
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  // Base of the width-sized segment containing this lane; e.g. with
+  // __width == 8 and lane id 13, the base is (13 & ~7) == 8, so __idx
+  // selects among lanes 8..15.
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __spirv_GroupNonUniformShuffle(/*Subgroup scope*/ 3, __x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+ return false; // TODO
+}
+// Returns true if the flat pointer points to 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+ return false; // TODO
+}
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_trap(); // A [[noreturn]] function must not return; trap for now.
+}
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  // Currently a no-op: no SPIR-V pause/yield hint is emitted yet.
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+#endif // __SPIRVINTRIN_H
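For illustration, here is a minimal kernel written against the wrappers above (a hypothetical sketch, not part of the patch; `fill_ids` and its launch setup are invented for the example):

```c++
// Hypothetical example: each invocation computes its flattened global id in
// the 'x' dimension using the block/thread queries from <gpuintrin.h>.
#include <gpuintrin.h>
#include <stdint.h>

__gpu_kernel void fill_ids(__gpu_global uint32_t *out) {
  uint32_t gid = __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  out[gid] = gid; // one slot per invocation
}
```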
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..0dc43ac034225 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -25,14 +25,18 @@ set(src_files
${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
)
-list(APPEND compile_options -flto)
-list(APPEND compile_options -fvisibility=hidden)
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+ NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+ list(APPEND compile_options -flto)
+ list(APPEND compile_options -fvisibility=hidden)
+ list(APPEND compile_options -Wno-unknown-cuda-version)
+endif()
list(APPEND compile_options -nogpulib)
list(APPEND compile_options -nostdlibinc)
list(APPEND compile_options -fno-rtti)
list(APPEND compile_options -fno-exceptions)
list(APPEND compile_options -fconvergent-functions)
-list(APPEND compile_options -Wno-unknown-cuda-version)
+
if(LLVM_DEFAULT_TARGET_TRIPLE)
list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
endif()
@@ -43,7 +47,7 @@ endif()
# instructions yet and we end up missing out on way more important constant
# propagation. That said, we will run the vectorizer again after the runtime
# has been linked into the user program.
-list(APPEND compile_options "SHELL: -mllvm -vectorize-slp=false")
+list(APPEND compile_options -mllvm -vectorize-slp=false)
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
set(target_name "amdgpu")
@@ -52,6 +56,10 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
set(target_name "nvptx")
list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+ "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+ set(target_name "spirv")
+ list(APPEND compile_options -emit-llvm -c)
endif()
# Trick to combine these into a bitcode file via the linker's LTO pass.
@@ -62,26 +70,32 @@ set_target_properties(libompdevice PROPERTIES
BUILD_RPATH ""
INSTALL_RPATH ""
RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
-
-# If the user built with the GPU C library enabled we will use that instead.
-if(TARGET libc)
- target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
-endif()
-target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-
-target_include_directories(libompdevice PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/include
- ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
- ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
-target_compile_options(libompdevice PRIVATE ${compile_options})
-target_link_options(libompdevice PRIVATE
+
+ # If the user built with the GPU C library enabled we will use that instead.
+ if(TARGET libc)
+ target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+ endif()
+ target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+
+ target_include_directories(libompdevice PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+ target_compile_options(libompdevice PRIVATE ${compile_options})
+ if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+ NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+ target_link_options(libompdevice PRIVATE
"-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
-if(LLVM_DEFAULT_TARGET_TRIPLE)
- target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
-endif()
-install(TARGETS libompdevice
- PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
- DESTINATION ${OPENMP_INSTALL_LIBDIR})
+ else()
+ target_link_options(libompdevice PRIVATE
+ "-nostdlib" "-emit-llvm" "-Wl")
+ endif()
+ if(LLVM_DEFAULT_TARGET_TRIPLE)
+ target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+ endif()
+ install(TARGETS libompdevice
+ PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+ DESTINATION ${OPENMP_INSTALL_LIBDIR})
add_library(ompdevice.all_objs OBJECT IMPORTED)
set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 213ccfe58b4fb..2c68109ca544d 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -131,7 +131,13 @@ struct IdentTy {
using __kmpc_impl_lanemask_t = LaneMaskTy;
-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;
+#else
+using FnPtrTy = void *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;
using CriticalNameTy = int32_t[8];
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 94b5e65196067..8881cf46176fd 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,7 +16,16 @@
namespace ompx {
+// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
+// This is to provide a workaround to use regular printf that is used in the code.
+#if defined(__SPIRV__)
+template <size_t N, typename... Args>
+int printf(const char (&Format)[N], Args... args) {
+ return __spirv_ocl_printf(Format, args...);
+}
+#else
int printf(const char *Format, ...);
+#endif
} // namespace ompx
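The template wrapper above sidesteps variadics because each call site instantiates a function with a fixed, concrete parameter list. A self-contained sketch of the same pattern (host-side stand-ins; `device_printf_impl` and `printf_like` are invented for illustration):

```c++
#include <cstddef>
#include <cstdio>

// Stand-in for __spirv_ocl_printf: per instantiation this has a concrete
// parameter list, so no C-style variadic function is ever emitted.
template <typename... Args>
int device_printf_impl(const char *Fmt, Args... As) {
  return std::printf(Fmt, As...); // host stand-in for the device intrinsic
}

// Mirror of the patch's wrapper: taking the format string by
// reference-to-array keeps it a deducible template parameter.
template <std::size_t N, typename... Args>
int printf_like(const char (&Format)[N], Args... As) {
  return device_printf_impl(Format, As...);
}

int main() {
  printf_like("%u lanes\n", 32u);
  return 0;
}
```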
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index cd6013780a49c..31dc1540d7dd4 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
__builtin_unreachable();
}
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
switch (Kind) {
case state::VK_ParallelRegionFn:
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 34c945c979ffb..3782478932046 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -23,7 +23,7 @@ using namespace allocator;
// Provide a default implementation of malloc / free for AMDGPU platforms built
// without 'libc' support.
extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else
diff --git a/openmp/device/src/LibC.cpp b/openmp/device/src/LibC.cpp
index 83f9233d94803..6934387952b7c 100644
--- a/openmp/device/src/LibC.cpp
+++ b/openmp/device/src/LibC.cpp
@@ -31,14 +31,16 @@ extern "C" {
for (size_t I = 0; I < count; ++I)
dstc[I] = C;
}
-
+#if !defined(__SPIRV__)
[[gnu::weak]] int printf(const char *Format, ...) {
__builtin_va_list vlist;
__builtin_va_start(vlist, Format);
return ::vprintf(Format, vlist);
}
+#endif
}
+#if !defined(__SPIRV__)
namespace ompx {
[[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
__builtin_va_list vlist;
@@ -46,3 +48,4 @@ namespace ompx {
return ::vprintf(Format, vlist);
}
} // namespace ompx
+#endif
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index bd2c0799ee9f0..9f74990ce43ea 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
- int32_t bound_tid, void *fn,
+ int32_t bound_tid, FnPtrTy fn,
void **args, int64_t nargs) {
switch (nargs) {
#include "generated_microtask_cases.gen"
@@ -84,7 +84,7 @@ extern "C" {
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
int32_t num_threads,
- void *fn, void **args,
+ FnPtrTy fn, void **args,
const int64_t nargs) {
uint32_t TId = mapping::getThreadIdInBlock();
uint32_t NumThreads = determineNumberOfThreads(num_threads);
@@ -142,8 +142,8 @@ extern "C" {
[[clang::always_inline]] void
__kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
- int32_t num_threads, int proc_bind, void *fn,
- void *wrapper_fn, void **args, int64_t nargs,
+ int32_t num_threads, int proc_bind, FnPtrTy fn,
+ FnPtrTy wrapper_fn, void **args, int64_t nargs,
int32_t nt_strict) {
uint32_t TId = mapping::getThreadIdInBlock();
@@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
1u, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
- (void *)nullptr, true, ident,
+ (FnPtrTy) nullptr, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
/*ForceTeamState=*/true);
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 501dc4a291ed1..09edb8dc2d9cc 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -258,6 +258,79 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
#endif
///}
+#if defined(__SPIRV__)
+
+MemorySemantics_t convertOrderingType(atomic::OrderingTy Ordering) {
+ switch (Ordering) {
+ default:
+ __builtin_unreachable();
+ case atomic::relaxed:
+ return MemorySemantics_t::Relaxed;
+ case atomic::acquire:
+ return MemorySemantics_t::Acquire;
...
[truncated]
jhuber6 left a comment:
Thanks for taking a first look at this; I won't be able to test the functional changes required to get the DeviceRTL working, so this really helps.
openmp/device/include/LibC.h (outdated):

> namespace ompx {
> // SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
Weird, I'll look at that.
@jhuber6,

Users configure this manually; it'd be something like this, which should just work if we can get it to compile.
✅ With the latest revision this PR passed the C/C++ code formatter.
Would you be able to look over #174910 and let me know if it actually works? I don't know how to actually run this stuff on my hardware, so I'm mostly just going off of the spec. I'll probably merge it soon.
It doesn't.
Hmmm, might be because SPIR-V doesn't handle implicit conversions between address spaces? I'll need to look into that.

To my knowledge, there's no way to handle this. I suppose I can provide it in the header but just hard-code it to always return false.
Also, let me know whether those actually return the expected values when you run them on a GPU.
We'll need something like https://github.com/llvm/llvm-project/blob/main/clang/lib/Basic/Targets/AMDGPU.h#L123 for SPIR-V, but I've got no clue how they handle their address spaces.
There are some issues in SPIR-V with address spaces. I remember Nick fixed some of them, but likely not all.
I made #175109, but I've got no clue if it's even legal. Someone who actually knows the backend, let me know.
openmp/device/include/LibC.h (outdated):

> // SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
> // This is to provide a workaround to use regular printf that is used in the code.
I have #175076 up but will need some SPIR-V people to look at it.
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed.
Do you mean Clang SPIR-V builtin calls vs. SPIR-V API calls? The test pass rate was the same for both when I ran it. It was significantly lower than before, but that is likely related to other changes rather than those in spirvintrin.h.
openmp/device/include/DeviceTypes.h (outdated):

> using ParallelRegionFnTy = void *;
> #ifdef __SPIRV__
> const uint32_t ProgramAS = 9;
> using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;
Do the C++ attributes not work? And just make this constexpr or put the 9 directly in the attribute. Honestly, I'm surprised that SPIR-V bothers with this at all; I can't imagine a single case where it would make sense for a function to be anything that isn't global.
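For reference, a minimal sketch of the suggested constexpr form (assumed, not necessarily the PR's final code):

```c++
#include <cstdint>

// Hypothetical: a constexpr initialized from a literal is an integer constant
// expression, so it can be used directly inside the address_space attribute.
constexpr uint32_t ProgramAS = 9;
using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;
```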
> #if defined(__SPIRV__)
>
> uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
This bit was changed in a previous PR to use the __scoped_atomic_fetch_uinc builtin, which apparently fails on SPIR-V. Best solution: fix the backend, since the default lowering should be a CAS loop. Faster solution: copy the handling for the atomic multiply just below it and put an #ifdef __SPIRV__ around the builtin and a manual CAS loop.
Copied the implementation for the SPIR-V backend until the backend is fixed.
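Sketched out, the manual CAS-loop fallback described above might look like this (a hypothetical shape, not the exact code that landed; it assumes the usual CUDA-style wrap-around semantics `old >= Val ? 0 : old + 1` and elides the Ordering/MemScope mapping for brevity):

```c++
#include <stdint.h>

// Hypothetical CAS-loop implementation of atomicInc for targets where the
// __scoped_atomic_fetch_uinc builtin is unavailable or miscompiles.
static uint32_t atomicIncCas(uint32_t *Address, uint32_t Val) {
  uint32_t Old = __atomic_load_n(Address, __ATOMIC_RELAXED);
  uint32_t New;
  do {
    New = (Old >= Val) ? 0 : Old + 1; // wrap-around increment
  } while (!__atomic_compare_exchange_n(Address, &Old, New, /*weak=*/true,
                                        __ATOMIC_RELAXED, __ATOMIC_RELAXED));
  return Old; // the value observed before the update
}
```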
> return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
> }
>
> void fenceKernel(atomic::OrderingTy Ordering) {
I should probably make these common; I thought I did that already.
I made #177710; hopefully it will let us simplify some of this if it lands without issue.
Rebase this on trunk with the cleanups I did, and we're probably good to go.
Done.
Thanks for sticking with this.
@fineg74 Congratulations on having your first Pull Request (PR) merged into the LLVM Project!
This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: Joseph Huber <[email protected]>
This reverts commit 8db9774.
This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: Joseph Huber <[email protected]>
This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: fineg74 <[email protected]> Co-authored-by: Joseph Huber <[email protected]>
This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: Joseph Huber <[email protected]>
@jhuber6, do you think we should enable building DeviceRTL with SPIRV by default? I know it is probably not of production quality yet, but the only potential user of it is the level-zero plugin, which is not enabled by default. On the other hand, it should make the work of everybody working on this a little easier.
What do you mean by "default"? None of these are enabled by default; the user opts into them. If you're asking about adding it to the
What we should do is build it when the user selects "all" architectures. I had some trouble getting things set up, and that's in part because "all" didn't always include level_zero. That said, #178779 is certainly a blocker for a default build. If the user has an old llvm-link on the path, it will error out at the moment.
We already build the SPIR-V backend by default. The way people build the OpenMP device RTL is through CMake, where you specify the architectures you want. We do want the convenience of a cache file, but that should wait until we have a bot that can run a majority of the tests without failing.
> ///}
>
> #if defined(__SPIRV__)
> void namedBarrierInit() { __builtin_trap(); } // TODO
This was my fault. The trap cannot go here; it should only trap if the named barrier is actually used.
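A sketch of the shape that fix might take (hypothetical names; only the use of the named barrier should trap, not its initialization):

```c++
// Hypothetical split: initialization is harmless, so only actually using an
// unsupported named barrier traps.
void namedBarrierInit() {} // nothing to set up on SPIR-V yet
void namedBarrier() { __builtin_trap(); } // TODO: implement named barriers
```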