Skip to content

Conversation

@jhuber6
Copy link
Contributor

@jhuber6 jhuber6 commented Jan 8, 2026

Summary:
#174862 and
#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).

This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just to start
and hopefully start testing things later.

Would appreciate someone more familiar with the backend double-checking
these.

@llvmbot llvmbot added clang Clang issues not falling into any other category backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics labels Jan 8, 2026
@llvmbot
Copy link
Member

llvmbot commented Jan 8, 2026

@llvm/pr-subscribers-backend-x86

Author: Joseph Huber (jhuber6)

Changes

Summary:
#174862 and
#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).

This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just to start
and hopefully start testing things later.

Would appreciate someone more familiar with the backend double-checking
these.


Full diff: https://github.com/llvm/llvm-project/pull/174910.diff

4 Files Affected:

  • (modified) clang/lib/Headers/CMakeLists.txt (+1)
  • (modified) clang/lib/Headers/gpuintrin.h (+2)
  • (added) clang/lib/Headers/spirvintrin.h (+171)
  • (modified) clang/test/Headers/gpuintrin_lang.c (+19-1)
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
   )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..30f3667adea73 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..bf5df70583dc6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,171 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_spirv_subgroup_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(1);
+  return __builtin_bit_cast(uint64_t,
+                            __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_spirv_subgroup_shuffle(__x,
+                                          __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of lanes in the current wavefront for which \p x is true. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads.
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(__x);
+  return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+                                                        __mask, __mask, 0, 1));
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
 // RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=OPENMP
 //
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN:   -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SYCL
+//
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
 // RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
 // RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
 
 #ifdef __device__
 __device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
 #else
 // CUDA-LABEL: define dso_local i32 @foo(
 // CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
 // CUDA-NEXT:  [[ENTRY:.*:]]
-// CUDA-NEXT:    [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT:    [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CUDA-NEXT:    ret i32 [[TMP0]]
 //
 // HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
 // OPENMP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 // OPENMP-NEXT:    ret i32 [[TMP0]]
 //
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT:  [[ENTRY:.*:]]
+// SYCL-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// SYCL-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to ptr addrspace(4)
+// SYCL-NEXT:    [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 @llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT:    ret i32 [[CONV_I]]
+//
 // C89-LABEL: define dso_local i32 @foo(
 // C89-SAME: ) #[[ATTR0:[0-9]+]] {
 // C89-NEXT:  [[ENTRY:.*:]]

@llvmbot
Copy link
Member

llvmbot commented Jan 8, 2026

@llvm/pr-subscribers-clang

Author: Joseph Huber (jhuber6)

Changes

Summary:
#174862 and
#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).

This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just to start
and hopefully start testing things later.

Would appreciate someone more familiar with the backend double-checking
these.


Full diff: https://github.com/llvm/llvm-project/pull/174910.diff

4 Files Affected:

  • (modified) clang/lib/Headers/CMakeLists.txt (+1)
  • (modified) clang/lib/Headers/gpuintrin.h (+2)
  • (added) clang/lib/Headers/spirvintrin.h (+171)
  • (modified) clang/test/Headers/gpuintrin_lang.c (+19-1)
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
   gpuintrin.h
   nvptxintrin.h
   amdgpuintrin.h
+  spirvintrin.h
   )
 
 set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..30f3667adea73 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
 #elif !defined(_OPENMP)
 #error "This header is only meant to be used on GPU architectures."
 #endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..bf5df70583dc6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,171 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_spirv_subgroup_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(1);
+  return __builtin_bit_cast(uint64_t,
+                            __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_spirv_subgroup_shuffle(__x,
+                                          __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of lanes in the current wavefront for which \p x is true. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads.
+  uint32_t [[clang::ext_vector_type(4)]] __mask =
+      __builtin_spirv_subgroup_ballot(__x);
+  return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+                                                        __mask, __mask, 0, 1));
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
 // RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
 // RUN: | FileCheck %s --check-prefix=OPENMP
 //
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN:   -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SYCL
+//
 // RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
 // RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
 // RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
 
 #ifdef __device__
 __device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
 #else
 // CUDA-LABEL: define dso_local i32 @foo(
 // CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
 // CUDA-NEXT:  [[ENTRY:.*:]]
-// CUDA-NEXT:    [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT:    [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CUDA-NEXT:    ret i32 [[TMP0]]
 //
 // HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
 // OPENMP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 // OPENMP-NEXT:    ret i32 [[TMP0]]
 //
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT:  [[ENTRY:.*:]]
+// SYCL-NEXT:    [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// SYCL-NEXT:    [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to ptr addrspace(4)
+// SYCL-NEXT:    [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 @llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT:    ret i32 [[CONV_I]]
+//
 // C89-LABEL: define dso_local i32 @foo(
 // C89-SAME: ) #[[ATTR0:[0-9]+]] {
 // C89-NEXT:  [[ENTRY:.*:]]

@jhuber6
Copy link
Contributor Author

jhuber6 commented Jan 8, 2026

This will fail tests until the dependent PRs are merged. Inspecting the basic IR makes sense but I have no way to test this. Hopefully @sarnex can help here in the future because this should make porting the OpenMP support much easier.

The SPIR-V intrinsics are missing thread syncs, an exit, and the pointer introspections. No clue if I got the address spaces or the thread -> grid accessors right.

@github-actions
Copy link

github-actions bot commented Jan 8, 2026

🪟 Windows x64 Test Results

  • 53374 tests passed
  • 2214 tests skipped

✅ The build succeeded and all tests passed.

@github-actions
Copy link

github-actions bot commented Jan 8, 2026

🐧 Linux x64 Test Results

  • 112569 tests passed
  • 4604 tests skipped

✅ The build succeeded and all tests passed.

Copy link
Member

@sarnex sarnex left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lgtm, but i asked greg from my team to also take a look at this since he's more familiar with what the correct logic should be

#define __gpu_generic __attribute__((address_space(4)))

// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we could unify all these and move it to gpuintrin.h and remove it from each target's header since i unified the attrs a while ago?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'll do a pass to simplify that in the future since it applies to the libc code as well.

@fineg74
Copy link
Contributor

fineg74 commented Jan 8, 2026

Couple of questions:

  1. The implementation does not contain some APIs that, for example, nvptxintrin.h and amdgpuintrin.h contain, like __gpu_is_ptr_private/__gpu_is_ptr_local/__gpu_sync_threads which are used across the code.
  2. Is there a reason to introduce and use new builtin intrinsics vs existing spirv builtin API (https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/__clang_spirv_builtins.h) like I did here: [OFFLOAD] Build DeviceRTL with SPIRV backend #174675 ?

@jhuber6
Copy link
Contributor Author

jhuber6 commented Jan 8, 2026

Couple of questions:

1. The implementation does not contain some APIs that, for example, nvptxintrin.h and amdgpuintrin.h contain, like __gpu_is_ptr_private/__gpu_is_ptr_local/__gpu_sync_threads which are used across the code.

Yes, these aren't intended to be a completely inclusive set. I'm working on exposing OpControlBarrier right now so that will be added. The address space introspection isn't necessarily required. SPIR-V has OpTerminateInvocation but it only applies to Fragment shaders for some reason so there's no real exit solution, similar story with scheduling helpers but that one can safely be a no-op.

2. Is there a reason to introduce and use new builtin intrinsics vs existing spirv builtin API (https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/__clang_spirv_builtins.h) like I did here: [ [OFFLOAD] Build DeviceRTL with SPIRV backend #174675](https://github.com/llvm/llvm-project/pull/174675) ?

I don't know exactly how SPIR-V works. It seems that some things are resolved as external functions and hooked up by some Khronos tool? I'd prefer if we moved away from that now that we have a backend. Correct me if I'm wrong here. The proper way of doing this is always builtins to LLVM backend intrinsics, everything else is more of a temporary hack as far as I'm aware.

The clang_spirv_builtins seems to be more like a level of indirection over whether or not we're using the translator or the backend, while these helpers are more about abstracting the same compute-type behavior across the various GPU targets. I'd prefer we move towards using the backend for everything, but I don't know if that conflicts with some internal Intel goal or something.

Summary:
llvm#174862 and
llvm#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).

This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just to start
and hopefully start testing things later.

Would appreciate someone more familiar with the backend double-checking
these.
@jhuber6 jhuber6 merged commit 5c43243 into llvm:main Jan 9, 2026
10 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Jan 9, 2026

LLVM Buildbot has detected a new failure on builder llvm-clang-aarch64-darwin running on doug-worker-4 while building clang at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/33957

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
rm -rf /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp && mkdir -p /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp
# executed command: rm -rf /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp
# note: command had no output on stdout or stderr
# executed command: mkdir -p /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp
# note: command had no output on stdout or stderr
# RUN: at line 2
/Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/llvm-mc -triple=arm64-apple-darwin -filetype=obj -o /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.o /Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s
# executed command: /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/llvm-mc -triple=arm64-apple-darwin -filetype=obj -o /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.o /Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s
# .---command stderr------------
# | /Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s:16:2: warning: .build_version macos used while targeting darwin
# |         .build_version macos, 26, 0
# |         ^
# `-----------------------------
# RUN: at line 3
not --crash /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/llvm-jitlink -debugger-support=false      -write-symtab /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.symtab.txt /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.o      > /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/backtrace.txt 2>&1
# executed command: not --crash /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/llvm-jitlink -debugger-support=false -write-symtab /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.symtab.txt /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.o
# note: command had no output on stdout or stderr
# RUN: at line 6
/Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/llvm-jitlink -symbolicate-with /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.symtab.txt /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/backtrace.txt      | /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s
# executed command: /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/llvm-jitlink -symbolicate-with /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.symtab.txt /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/backtrace.txt
# note: command had no output on stdout or stderr
# executed command: /Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s
# .---command stderr------------
# | �[1m/Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s:14:10: �[0m�[0;1;31merror: �[0m�[1mCHECK: expected string not found in input
�[0m# | �[1m�[0m# CHECK: this_should_crash {{.*}} ({{.*}}crash.o)
# | �[0;1;32m         ^
�[0m# | �[0;1;32m�[0m�[1m<stdin>:1:1: �[0m�[0;1;30mnote: �[0m�[1mscanning from here
�[0m# | �[1m�[0mPLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace and instructions to reproduce the bug.
# | �[0;1;32m^
�[0m# | �[0;1;32m�[0m�[1m<stdin>:9:161: �[0m�[0;1;30mnote: �[0m�[1mpossible intended match here
�[0m# | �[1m�[0m4 (error) ffff800104804010 _main + 18446603336221196276 (/Volumes/RAMDisk/buildbot-root/aarch64-darwin/build/test/ExecutionEngine/JITLink/AArch64/Output/backtrace-symbolication.s.tmp/crash.o)
# | �[0;1;32m                                                                                                                                                                ^
�[0m# | �[0;1;32m�[0m
# | Input file: <stdin>
# | Check file: /Users/buildbot/buildbot-root/llvm-project/llvm/test/ExecutionEngine/JITLink/AArch64/backtrace-symbolication.s
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# | �[1m�[0m�[0;1;30m            1: �[0m�[1m�[0;1;46mPLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace and instructions to reproduce the bug. �[0m
# | �[0;1;31mcheck:14'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
�[0m# | �[0;1;31m�[0m�[0;1;30m            2: �[0m�[1m�[0;1;46mStack dump: �[0m
# | �[0;1;31mcheck:14'0     ~~~~~~~~~~~~
...

Priyanshu3820 pushed a commit to Priyanshu3820/llvm-project that referenced this pull request Jan 18, 2026
Summary:
llvm#174862 and
llvm#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).

This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just to start
and hopefully start testing things later.

Would appreciate someone more familiar with the backend double-checking
these.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics clang Clang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

6 participants