[OFFLOAD] Build DeviceRTL with SPIRV backend #174675

fineg74 · 2026-01-07T00:54:07Z

This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs

github-actions · 2026-01-07T00:54:27Z

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write permissions for the repository. In which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

llvmbot · 2026-01-07T00:54:59Z

@llvm/pr-subscribers-backend-x86

Author: None (fineg74)

Changes

This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs

Patch is 22.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174675.diff

11 Files Affected:

(modified) clang/lib/Headers/CMakeLists.txt (+1)
(modified) clang/lib/Headers/gpuintrin.h (+3-1)
(added) clang/lib/Headers/spirvintrin.h (+235)
(modified) openmp/device/CMakeLists.txt (+37-23)
(modified) openmp/device/include/DeviceTypes.h (+7-1)
(modified) openmp/device/include/LibC.h (+9)
(modified) openmp/device/include/State.h (+1-1)
(modified) openmp/device/src/Allocator.cpp (+1-1)
(modified) openmp/device/src/LibC.cpp (+4-1)
(modified) openmp/device/src/Parallelism.cpp (+5-5)
(modified) openmp/device/src/Synchronization.cpp (+73)

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
   )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..8b75cc14878e3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -18,7 +18,7 @@
 #define __GPUINTRIN_H
 
 #if !defined(_DEFAULT_FN_ATTRS)
-#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__HIP__) || defined(__CUDA__) || defined(__SPIRV__)
 #define _DEFAULT_FN_ATTRS __attribute__((device))
 #else
 #define _DEFAULT_FN_ATTRS
@@ -62,6 +62,8 @@ _Pragma("omp end declare target");
 #include <amdgpuintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
 #endif
 
 _Pragma("omp begin declare target device_type(nohost)");
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..e66a2bf0767a6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,235 @@
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIRV targets or offloading to SPIRV"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+#include <stdint.h>
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#define true 1
+#define false 0
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+//
+#define __gpu_private  __attribute__((address_space(0)))
+#define __gpu_constant
+#define __gpu_local
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((spirv_kernel, visibility("protected")))
+#define __SPIRV_VAR_QUALIFIERS extern const
+// Workgroup and invocation ID functions
+uint64_t __spirv_BuiltInNumWorkgroups(int i);
+uint64_t __spirv_BuiltInWorkgroupId(int i);
+uint64_t __spirv_BuiltInWorkgroupSize(int i);
+uint64_t __spirv_BuiltInLocalInvocationId(int i);
+
+typedef enum {
+  CrossDevice = 0,
+  Device = 1,
+  Workgroup = 2,
+  Subgroup = 3,
+  Invocation = 4
+} Scope_t;
+
+typedef enum {
+  Relaxed = 0x0,
+  Acquire = 0x2,
+  Release = 0x4,
+  AcquireRelease = 0x8,
+  SequentiallyConsistent = 0x10
+} MemorySemantics_t;
+
+using unsigned ProgramAS = 9;
+
+#ifdef __cplusplus
+template <typename... Args> int __spirv_ocl_printf(Args...);
+#endif
+
+// Subgroup
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
+__SPIRV_VAR_QUALIFIERS uint32_t __spirv_BuiltInSubgroupSize;
+
+// Group non-uniform operations
+uint64_t __spirv_GroupNonUniformBallot(uint32_t execution_scope,
+                                       bool predicate);
+uint32_t __spirv_GroupNonUniformBroadcastFirst(uint32_t execution_scope,
+                                               uint32_t value);
+uint32_t __spirv_GroupNonUniformShuffle(uint32_t execution_scope,
+                                        uint32_t value, uint32_t id);
+
+// Synchronization
+void __spirv_ControlBarrier(uint32_t execution_scope, uint32_t memory_scope,
+                            uint32_t semantics);
+void __spirv_MemoryBarrier(uint32_t memory_scope, uint32_t semantics);
+
+// Atomic
+uint32_t __spirv_AtomicIAdd(uint32_t *, int, int, uint32_t);
+void __spirv_AtomicStore(int32_t *, int, int, int);
+int32_t __spirv_AtomicLoad(int32_t *, int, int);
+int32_t __spirv_AtomicCompareExchange(int32_t *, int, int, int, int, int);
+
+
+// Returns the number of blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __spirv_BuiltInNumWorkgroups(0);
+}
+
+// Returns the number of blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __spirv_BuiltInNumWorkgroups(1);
+}
+
+// Returns the number of blocks in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __spirv_BuiltInNumWorkgroups(2);
+}
+
+// Returns the 'x' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __spirv_BuiltInWorkgroupId(0);
+}
+
+// Returns the 'y' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __spirv_BuiltInWorkgroupId(1);
+}
+
+// Returns the 'z' dimension of the current block's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __spirv_BuiltInWorkgroupId(2);
+}
+
+// Returns the number of threads in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __spirv_BuiltInWorkgroupSize(0);
+}
+
+// Returns the number of threads in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __spirv_BuiltInWorkgroupSize(1);
+}
+
+// Returns the number of threads in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __spirv_BuiltInWorkgroupSize(2);
+}
+
+// Returns the 'x' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __spirv_BuiltInLocalInvocationId(0);
+}
+
+// Returns the 'y' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __spirv_BuiltInLocalInvocationId(1);
+}
+
+// Returns the 'z' dimension id of the thread in the current block.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __spirv_BuiltInLocalInvocationId(2);
+}
+
+// Returns the size of a warp, always 32 on NVIDIA hardware.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __spirv_BuiltInSubgroupSize;
+}
+
+// Returns the id of the thread inside of a warp executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __spirv_BuiltInSubgroupLocalInvocationId;
+}
+ 
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { 
+  uint32_t Size = __gpu_num_lanes();
+  return ((uint64_t)1 << Size) - (uint64_t)1;
+}
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __spirv_GroupNonUniformBroadcastFirst(3, __x);
+}
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  uint64_t ballot = __spirv_GroupNonUniformBallot(3, __x);
+  return __lane_mask & ballot;
+}
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+   __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Workgroup,
+                          0x100 | MemorySemantics_t::SequentiallyConsistent);
+}
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+   __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Subgroup,
+                          0x80 | MemorySemantics_t::SequentiallyConsistent);
+}
+// Shuffles the the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __spirv_GroupNonUniformShuffle(3, __x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+// Returns true if the flat pointer points to 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return false; // TODO
+}
+// Returns true if the flat pointer points to 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return false; // TODO
+}
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+}
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+#endif // __SPIRVINTRIN_H
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..0dc43ac034225 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -25,14 +25,18 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
 )
 
-list(APPEND compile_options -flto)
-list(APPEND compile_options -fvisibility=hidden)
+if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+   NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+  list(APPEND compile_options -flto)
+  list(APPEND compile_options -fvisibility=hidden)
+  list(APPEND compile_options -Wno-unknown-cuda-version)
+endif()
 list(APPEND compile_options -nogpulib)
 list(APPEND compile_options -nostdlibinc)
 list(APPEND compile_options -fno-rtti)
 list(APPEND compile_options -fno-exceptions)
 list(APPEND compile_options -fconvergent-functions)
-list(APPEND compile_options -Wno-unknown-cuda-version)
+
 if(LLVM_DEFAULT_TARGET_TRIPLE)
   list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
 endif()
@@ -43,7 +47,7 @@ endif()
 # instructions yet and we end up missing out on way more important constant
 # propagation. That said, we will run the vectorizer again after the runtime
 # has been linked into the user program.
-list(APPEND compile_options "SHELL: -mllvm -vectorize-slp=false")
+list(APPEND compile_options -mllvm -vectorize-slp=false)
 if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
    "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn")
   set(target_name "amdgpu")
@@ -52,6 +56,10 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
        "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
   set(target_name "nvptx")
   list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+  set(target_name "spirv") 
+  list(APPEND compile_options -emit-llvm -c)
 endif()
 
 # Trick to combine these into a bitcode file via the linker's LTO pass.
@@ -62,26 +70,32 @@ set_target_properties(libompdevice PROPERTIES
   BUILD_RPATH ""
   INSTALL_RPATH ""
   RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
-
-# If the user built with the GPU C library enabled we will use that instead.
-if(TARGET libc)
-  target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
-endif()
-target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-
-target_include_directories(libompdevice PRIVATE
-                           ${CMAKE_CURRENT_SOURCE_DIR}/include
-                           ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
-                           ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
-target_compile_options(libompdevice PRIVATE ${compile_options})
-target_link_options(libompdevice PRIVATE
+ 
+  # If the user built with the GPU C library enabled we will use that instead.
+  if(TARGET libc)
+    target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
+  endif()
+  target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
+  
+  target_include_directories(libompdevice PRIVATE
+                             ${CMAKE_CURRENT_SOURCE_DIR}/include
+                             ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
+                             ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
+  target_compile_options(libompdevice PRIVATE ${compile_options})
+  if(NOT "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" AND
+     NOT "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")  
+    target_link_options(libompdevice PRIVATE
                     "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
-if(LLVM_DEFAULT_TARGET_TRIPLE)
-  target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
-endif()
-install(TARGETS libompdevice
-        PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
-        DESTINATION ${OPENMP_INSTALL_LIBDIR})
+  else()
+    target_link_options(libompdevice PRIVATE
+                    "-nostdlib" "-emit-llvm" "-Wl")
+  endif()
+  if(LLVM_DEFAULT_TARGET_TRIPLE)
+    target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
+  endif()
+  install(TARGETS libompdevice
+          PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+          DESTINATION ${OPENMP_INSTALL_LIBDIR})
 
 add_library(ompdevice.all_objs OBJECT IMPORTED)
 set_property(TARGET ompdevice.all_objs APPEND PROPERTY IMPORTED_OBJECTS
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 213ccfe58b4fb..2c68109ca544d 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -131,7 +131,13 @@ struct IdentTy {
 
 using __kmpc_impl_lanemask_t = LaneMaskTy;
 
-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;
+#else
+using FnPtrTy = void *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;
 
 using CriticalNameTy = int32_t[8];
 
diff --git a/openmp/device/include/LibC.h b/openmp/device/include/LibC.h
index 94b5e65196067..8881cf46176fd 100644
--- a/openmp/device/include/LibC.h
+++ b/openmp/device/include/LibC.h
@@ -16,7 +16,16 @@
 
 namespace ompx {
 
+// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
+// This is to provide a workaround to use regular printf that is used in the code.
+#if defined(__SPIRV__)
+template <size_t N, typename... Args>
+int printf(const char (&Format)[N], Args... args) {
+  return __spirv_ocl_printf(Format, args...);
+}
+#else
 int printf(const char *Format, ...);
+#endif
 
 } // namespace ompx
 
diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h
index cd6013780a49c..31dc1540d7dd4 100644
--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
   __builtin_unreachable();
 }
 
-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
 lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
   switch (Kind) {
   case state::VK_ParallelRegionFn:
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index 34c945c979ffb..3782478932046 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -23,7 +23,7 @@ using namespace allocator;
 // Provide a default implementation of malloc / free for AMDGPU platforms built
 // without 'libc' support.
 extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
 [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
 [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
 #else
diff --git a/openmp/device/src/LibC.cpp b/openmp/device/src/LibC.cpp
index 83f9233d94803..6934387952b7c 100644
--- a/openmp/device/src/LibC.cpp
+++ b/openmp/device/src/LibC.cpp
@@ -31,14 +31,16 @@ extern "C" {
   for (size_t I = 0; I < count; ++I)
     dstc[I] = C;
 }
-
+#if !defined(__SPIRV__)
 [[gnu::weak]] int printf(const char *Format, ...) {
   __builtin_va_list vlist;
   __builtin_va_start(vlist, Format);
   return ::vprintf(Format, vlist);
 }
+#endif
 }
 
+#if !defined(__SPIRV__)
 namespace ompx {
 [[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
   __builtin_va_list vlist;
@@ -46,3 +48,4 @@ namespace ompx {
   return ::vprintf(Format, vlist);
 }
 } // namespace ompx
+#endif
diff --git a/openmp/device/src/Parallelism.cpp b/openmp/device/src/Parallelism.cpp
index bd2c0799ee9f0..9f74990ce43ea 100644
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
 
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 [[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
+                                              int32_t bound_tid, FnPtrTy fn,
                                               void **args, int64_t nargs) {
   switch (nargs) {
 #include "generated_microtask_cases.gen"
@@ -84,7 +84,7 @@ extern "C" {
 
 [[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                    int32_t num_threads,
-                                                   void *fn, void **args,
+                                                   FnPtrTy fn, void **args,
                                                    const int64_t nargs) {
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
@@ -142,8 +142,8 @@ extern "C" {
 
 [[clang::always_inline]] void
 __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs,
+                   int32_t num_threads, int proc_bind, FnPtrTy fn,
+                   FnPtrTy wrapper_fn, void **args, int64_t nargs,
                    int32_t nt_strict) {
   uint32_t TId = mapping::getThreadIdInBlock();
 
@@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
                                           1u, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
+                                          (FnPtrTy) nullptr, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                      /*ForceTeamState=*/true);
diff --git a/openmp/device/src/Synchronization.cpp b/openmp/device/src/Synchronization.cpp
index 501dc4a291ed1..09edb8dc2d9cc 100644
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@@ -258,6 +258,79 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 #endif
 ///}
 
+#if defined(__SPIRV__)
+
+MemorySemantics_t convertOrderingType(atomic::OrderingTy Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::relaxed:
+    return MemorySemantics_t::Relaxed;
+  case atomic::acquire:
+    return MemorySemantics_t::Acquire;
...
[truncated]

jhuber6

Thanks for taking a first look at this, I won't be able to test the functional changes required to get the DeviceRTL working so this really helps.

jhuber6 · 2026-01-08T20:33:13Z

openmp/device/include/LibC.h


 namespace ompx {

+// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf


Weird, I'll look at that.

openmp/device/CMakeLists.txt

fineg74 · 2026-01-08T21:45:41Z

Thanks for taking a first look at this, I won't be able to test the functional changes required to get the DeviceRTL working so this really helps.

@jhuber6,
This change won't build the DeviceRTL for SPIRV by default as I didn't think somebody works on that as it was designed as a first step to support Level Zero work which is still needs work to become fully operational as it requires spirv implementations for libc/libcxx. But since you are actively working on that, do you want me to enable the spirv build for DeviceRTL by default ?

jhuber6 · 2026-01-08T21:47:26Z

do you want me to enable the spirv build for DeviceRTL by default ?

Users configure this manually, it'd be something like this, which should just work if we can get it to compile

-DRUNTIMES_spirv64-unknown-unknown_LLVM_ENABLE_RUNTIMES=openmp

github-actions · 2026-01-08T22:42:03Z

✅ With the latest revision this PR passed the C/C++ code formatter.

jhuber6 · 2026-01-08T22:43:01Z

Would you be able to look over #174910 and let me know if it actually works? I don't know how to actually run this stuff on my hardware so I'm mostly just going off of the spec. I'll probably merge it soon.

fineg74 · 2026-01-09T00:29:20Z

Would you be able to look over #174910 and let me know if it actually works? I don't know how to actually run this stuff on my hardware so I'm mostly just going off of the spec. I'll probably merge it soon.

It doesn't .

It complains about changes in address space in multiple places. Resolved it by commenting out __gpu_constant and __gpu_local
It complain about lack of __gpu_is_ptr_local. Resolved by dummy implementation

jhuber6 · 2026-01-09T00:34:26Z

It complains about changes in address space in multiple places. Resolved it by commenting out __gpu_constant and __gpu_local

Hmmm, might be because SPIR-V doesn't handle implicit conversions between address spaces? I'll need to look into that.

It complain about lack of __gpu_is_ptr_local. Resolved by dummy implementation

To my knowledge, there's no way to handle this. I suppose I can provide it in the header but just hard-code it to always return false.

jhuber6 · 2026-01-09T00:45:39Z

Also, let me know functionally if those actually return the expected value when you run them on a GPU

jhuber6 · 2026-01-09T00:48:04Z

We'll need something like https://github.com/llvm/llvm-project/blob/main/clang/lib/Basic/Targets/AMDGPU.h#L123 for SPIR-V but I've got no clue how they handle their address spaces.

fineg74 · 2026-01-09T01:51:19Z

It complains about changes in address space in multiple places. Resolved it by commenting out __gpu_constant and __gpu_local

Hmmm, might be because SPIR-V doesn't handle implicit conversions between address spaces? I'll need to look into that.

There is some issue in SPIR-V with address spaces. I remember Nick fixed some issues but likely not all.

It complain about lack of __gpu_is_ptr_local. Resolved by dummy implementation

To my knowledge, there's no way to handle this. I suppose I can provide it in the header but just hard-code it to always return false.
I also don't know how to do it in pure SPIRV way and there are many other APIs that are not supported by SPIRV so it is probably OK to provide some dummy implementations

jhuber6 · 2026-01-09T01:53:50Z

There is some issue in SPIR-V with address spaces. I remember Nick fixed some issues but likely not all.

I made #175109 but I've got no clue if it's even legal. Someone who actually knows the backend let me know

jhuber6 · 2026-01-09T02:17:58Z

openmp/device/include/LibC.h


-// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf
-// This is to provide a workaround to use regular printf that is used in the code.
+// SPIR-V backend does not support variadic functions except for


I have #175076 up but will need some SPIR-V people to look at it.

github-actions · 2026-01-09T02:49:57Z

🐧 Linux x64 Test Results

112581 tests passed
4602 tests skipped

✅ The build succeeded and all tests passed.

fineg74 · 2026-01-09T03:39:32Z

Also, let me know functionally if those actually return the expected value when you run them on a GPU

Do you mean clang spirv builtins calls vs spirv API calls? The test pass rate for both cases was the same when I ran it. It was significantly lower than before but it is likely related to some other changes rather than those in spirvintrin.h

jhuber6 · 2026-01-23T23:51:44Z

openmp/device/include/DeviceTypes.h

-using ParallelRegionFnTy = void *;
+#ifdef __SPIRV__
+const uint32_t ProgramAS = 9;
+using FnPtrTy = __attribute__((address_space(ProgramAS))) void *;


Do the C++ attributes not work? And just make this constexpr or put the 9 directly in the attribute. Honestly I'm surprised that SPiR-V bothers with this at all, I can't imagine a single case it would make sense for a function to be something that isn't global.

jhuber6 · 2026-01-23T23:52:55Z

openmp/device/src/Synchronization.cpp


+#if defined(__SPIRV__)
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,


This bit was changed in a previous PR to use the __scoped_atomic_fetch_uinc builtin. This apparently fails on SPIR-V. Best solution, fix the backend because the default lowering should be a CAS loop. Faster solution, just copy the handling for the atomic multiply just below it and put an ifdef __SPIRV__ around the builtin and manual CAS loop.

Copied the implementation for spirv backend until backend is fixed

jhuber6 · 2026-01-23T23:58:51Z

openmp/device/src/Synchronization.cpp

+  return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
+}
+
+void fenceKernel(atomic::OrderingTy Ordering) {


I should probably make these common, I thought I did that already.

I made #177710 hopefully this will let us simplify some of this if it lands without issue

jhuber6 · 2026-01-26T20:21:22Z

Rebase this on trunk with the cleanups I did and we're probably good to go.

fineg74 · 2026-01-26T22:51:24Z

Rebase this on trunk with the cleanups I did and we're probably good to go.

Done

jhuber6 · 2026-01-27T00:04:57Z

Thanks for sticking with this

github-actions · 2026-01-27T00:09:54Z

@fineg74 Congratulations on having your first Pull Request (PR) merged into the LLVM Project!

Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR.

Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. It is not uncommon for your change to be included in a build that fails due to someone else's changes, or infrastructure issues.

How to do this, and the rest of the post-merge process, is covered in detail here.

If your change does cause a problem, it may be reverted, or you can revert it yourself. This is a normal part of LLVM development. You can fix your changes and open a new PR to merge them again.

If you don't get any reports, no action is required from you. Your changes are working as expected, well done!

This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: Joseph Huber <[email protected]>

This reverts commit 8db9774.

…m#174675)

This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: Joseph Huber <[email protected]>

This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: fineg74 <[email protected]> Co-authored-by: Joseph Huber <[email protected]>

This PR adds configuration to build DeviceRTL with SPIRV backend. It is primarily used for level-zero plugin for Intel GPUs --------- Co-authored-by: Joseph Huber <[email protected]>

fineg74 · 2026-01-29T18:26:58Z

@jhuber6, Do you think we should enable build of DeviceRTL with SPIRV by default ? I know it is probably not of production quality yet, but the only potential user of it is l0 plugin that is not enabled by default. On the other hand it should make work of everybody who is working on that a little bit easier

jhuber6 · 2026-01-29T18:29:50Z

@jhuber6, Do you think we should enable build of DeviceRTL with SPIRV by default ? I know it is probably not of production quality yet, but the only potential user of it is l0 plugin that is not enabled by default. On the other hand it should make work of everybody who is working on that a little bit easier

What do you mean by "default"? None of these are enabled by default, the user opts-into them. If you're asking about adding it to the Offload.cmake cache file that gives some build directions that should wait until it's a functional toolchain. You could add this to a list of supported architectures however, since we only have documentation examples for amdgcn and nvptx64.

jdoerfert · 2026-01-30T02:20:26Z

@jhuber6, Do you think we should enable build of DeviceRTL with SPIRV by default ? I know it is probably not of production quality yet, but the only potential user of it is l0 plugin that is not enabled by default. On the other hand it should make work of everybody who is working on that a little bit easier

What we should do is build it when the user selects "all" architectures. I had some trouble getting things set up, and that's in part because all didn't always include level_zero.

That said, #178779 is certainly a blocker for a default build. If the user has an old llvm-link on the path it will error out at the moment.
Also, https://github.com/llvm/llvm-project/pull/128513/changes#r2740142094 is a blocker. (Or some other fix for the issue).

jhuber6 · 2026-01-30T04:07:57Z

What we should do is build it when the user selects "all" architectures. I had some trouble getting things set up, and that's in part because all didn't always include level_zero.

We already build the SPIR-V backend by default. The way people build the OpenMP device RTL is through CMake where you specify the architectures you want. We do want the convenience of a cache file but that should wait until we have a bot that can run a majority of the tests without failing.

jdoerfert · 2026-01-31T00:55:27Z

openmp/device/src/Synchronization.cpp

 ///}

+#if defined(__SPIRV__)
+void namedBarrierInit() { __builtin_trap(); } // TODO


This was my fault. Here cannot be a trap, only if you use the named barrier.

fineg74 and others added 5 commits November 24, 2025 16:30

Build DeviceRTL with spirv backend

996c2e1

Merge branch 'main' into l0RTL

5e746a8

Merge remote-tracking branch 'origin/main' into l0RTL

3587050

Address PR comments

84def59

Fix formatting issues

5b6f225

llvmbot added backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics openmp:libomp OpenMP host runtime labels Jan 7, 2026

fineg74 mentioned this pull request Jan 8, 2026

[SPIR-V] Initial support for SPIR-V in gpuintrin.h #174910

Merged

jhuber6 reviewed Jan 8, 2026

View reviewed changes

Address PR comments

dc78bad

Fix formatting

e98cf56

jhuber6 reviewed Jan 9, 2026

View reviewed changes

fineg74 added 3 commits January 8, 2026 19:21

Allow build of DeviceRTL for SPIRV backend

40dfa28

Fix format

2922a82

Fix formatting

a18c339

Address PR comments

e66a4e4

karolzwolak mentioned this pull request Jan 23, 2026

add spirv64-intel-unknown target triple rust-lang/rust#151549

Draft

Merge remote-tracking branch 'origin/main' into l0RTL

867d96a

jhuber6 mentioned this pull request Jan 23, 2026

[OFFLOAD] Add plugin with support for Intel oneAPI Level Zero #158900

Merged

jhuber6 reviewed Jan 23, 2026

View reviewed changes

fineg74 added 3 commits January 23, 2026 17:16

Address PR comments

f92b77e

Fix formatting

7e5e695

Use c++ style attributes

bb719be

fineg74 added 2 commits January 26, 2026 14:34

Merge remote-tracking branch 'origin/main' into l0RTL

b9875fd

Cleanup based on trunk rebase

e2cc882

Fix formatting

5e20646

jhuber6 approved these changes Jan 27, 2026

View reviewed changes

jhuber6 enabled auto-merge (squash) January 27, 2026 00:04

jhuber6 merged commit 8db9774 into llvm:main Jan 27, 2026
11 checks passed

fineg74 deleted the l0RTL branch January 27, 2026 00:10

ronlieb added a commit to ROCm/llvm-project that referenced this pull request Jan 27, 2026

Revert " [OFFLOAD] Build DeviceRTL with SPIRV backend (llvm#174675)"

4c8959b

This reverts commit 8db9774.

ronlieb added a commit to ROCm/llvm-project that referenced this pull request Jan 27, 2026

revert_patches.txt: [OFFLOAD] Build DeviceRTL with SPIRV backend (llv…

edd5c52

…m#174675)

jdoerfert reviewed Jan 31, 2026

View reviewed changes


		namespace ompx {

		// SPIR-V backend does not support variadic functions except for __spirv_ocl_printf


		#if defined(__SPIRV__)

		uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,

[OFFLOAD] Build DeviceRTL with SPIRV backend #174675

[OFFLOAD] Build DeviceRTL with SPIRV backend #174675

Conversation

fineg74 commented Jan 7, 2026

Uh oh!

github-actions bot commented Jan 7, 2026

Uh oh!

llvmbot commented Jan 7, 2026

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

fineg74 commented Jan 8, 2026

Uh oh!

jhuber6 commented Jan 8, 2026

Uh oh!

github-actions bot commented Jan 8, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 commented Jan 8, 2026

Uh oh!

fineg74 commented Jan 9, 2026

Uh oh!

jhuber6 commented Jan 9, 2026

Uh oh!

jhuber6 commented Jan 9, 2026

Uh oh!

jhuber6 commented Jan 9, 2026

Uh oh!

fineg74 commented Jan 9, 2026

Uh oh!

jhuber6 commented Jan 9, 2026

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Jan 9, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

Uh oh!

fineg74 commented Jan 9, 2026

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jhuber6 commented Jan 26, 2026

Uh oh!

fineg74 commented Jan 26, 2026

Uh oh!

jhuber6 commented Jan 27, 2026

Uh oh!

Uh oh!

github-actions bot commented Jan 27, 2026

Uh oh!

fineg74 commented Jan 29, 2026

Uh oh!

jhuber6 commented Jan 29, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jdoerfert commented Jan 30, 2026

Uh oh!

jhuber6 commented Jan 30, 2026

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

github-actions bot commented Jan 8, 2026 •

edited

Loading

github-actions bot commented Jan 9, 2026 •

edited

Loading

jhuber6 commented Jan 29, 2026 •

edited

Loading