[SPIR-V] Initial support for SPIR-V in gpuintrin.h
#174910

Conversation
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-clang

Author: Joseph Huber (jhuber6)

Changes

Summary: #174862 and #174655 provided the intrinsics required to get the fundamental operations working for these. This patch sets up the basic support (as far as I know). This should be the first step towards allowing SPIR-V to build things like the LLVM libc and the OpenMP Device Runtime Library. The implementations here are intentionally inefficient, such as not using the dedicated SPIR-V opcode for read firstlane. This is just a starting point; hopefully testing can begin later. Would appreciate someone more familiar with the backend double-checking these.

Full diff: https://github.com/llvm/llvm-project/pull/174910.diff

4 Files Affected:
- clang/lib/Headers/CMakeLists.txt
- clang/lib/Headers/gpuintrin.h
- clang/lib/Headers/spirvintrin.h (new)
- clang/test/Headers/gpuintrin_lang.c
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
gpuintrin.h
nvptxintrin.h
amdgpuintrin.h
+ spirvintrin.h
)
set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..30f3667adea73 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..bf5df70583dc6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,171 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+ return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+ return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+ return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+ return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+ return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+ return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+ return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+ return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+ return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+ return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+ return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+ return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+ return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+ return __builtin_spirv_subgroup_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+ uint32_t [[clang::ext_vector_type(4)]] __mask =
+ __builtin_spirv_subgroup_ballot(1);
+ return __builtin_bit_cast(uint64_t,
+ __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __builtin_spirv_subgroup_shuffle(__x,
+ __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of threads in the current wavefront for which \p x is
+// true. This implementation is incorrect if the target uses more than 64
+// lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+ bool __x) {
+ // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+ // the active threads.
+ uint32_t [[clang::ext_vector_type(4)]] __mask =
+ __builtin_spirv_subgroup_ballot(__x);
+ return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+ __mask, __mask, 0, 1));
+}
+
+// Wait for all threads in the wavefront to converge; this is a no-op on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
// RUN: -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefix=OPENMP
//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN: -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN: -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SYCL
+//
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
// RUN: -std=c89 -internal-isystem %S/../../lib/Headers/ \
// RUN: -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
#ifdef __device__
__device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
#else
// CUDA-LABEL: define dso_local i32 @foo(
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
// CUDA-NEXT: [[ENTRY:.*:]]
-// CUDA-NEXT: [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CUDA-NEXT: ret i32 [[TMP0]]
//
// HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
// OPENMP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// OPENMP-NEXT: ret i32 [[TMP0]]
//
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT: [[ENTRY:.*:]]
+// SYCL-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// SYCL-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to ptr addrspace(4)
+// SYCL-NEXT: [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 @llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT: ret i32 [[CONV_I]]
+//
// C89-LABEL: define dso_local i32 @foo(
// C89-SAME: ) #[[ATTR0:[0-9]+]] {
// C89-NEXT: [[ENTRY:.*:]]
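
As a hypothetical illustration (not part of the patch): with these changes, a kernel written against the portable gpuintrin.h API should compile for SPIR-V the same way it already does for NVPTX and AMDGPU. The sketch below assumes only the macros and accessors added in this diff:

    // Minimal sketch, not from the PR: a portable kernel using gpuintrin.h,
    // which now dispatches to spirvintrin.h when __SPIRV__ is defined.
    #include <gpuintrin.h>

    __gpu_kernel void copy(__gpu_global uint32_t *dst,
                           const __gpu_global uint32_t *src) {
      // Flattened global thread index in the 'x' dimension, built from the
      // workgroup/workitem accessors defined by this header.
      uint32_t tid =
          __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
      dst[tid] = src[tid];
    }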
This will fail tests until the dependent PRs are merged. Inspecting the basic IR makes sense but I have no way to test this. Hopefully @sarnex can help here in the future because this should make porting the OpenMP support much easier. The SPIR-V intrinsics are missing thread syncs, an exit, and the pointer introspections. No clue if I got the address spaces or the thread -> grid accessors right.
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed.

🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed.
sarnex left a comment:

lgtm, but i asked greg from my team to also take a look at this since he's more familiar with what the correct logic should be
    #define __gpu_generic __attribute__((address_space(4)))

    // Attribute to declare a function as a kernel.
    #define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
maybe we could unify all these and move it to gpuintrin.h and remove it from each target's header since i unified the attrs a while ago?
Yeah, I'll do a pass to simplify that in the future since it applies to the libc code as well.
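
For reference, the unification discussed above might look roughly like the following in gpuintrin.h. This is a hypothetical sketch based only on the attribute macros in this PR, not something the patch contains:

    // Hypothetical sketch, not part of this patch: hoist the kernel attribute
    // into gpuintrin.h once every target header defines it identically.
    #if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__SPIRV__)
    #define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
    #endif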
Couple of questions:

Yes, these aren't intended to be a completely inclusive set. I'm working on exposing
I don't know exactly how SPIR-V works. It seems that some things are resolved as external functions and hooked up by some Khronos tool? I'd prefer if we moved away from that now that we have a backend. Correct me if I'm wrong here. The proper way of doing this is always builtins to LLVM backend intrinsics; everything else is more of a temporary hack as far as I'm aware.
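
For example, the lowering exercised by the gpuintrin_lang.c test in this patch follows exactly that builtin-to-intrinsic route, with no external Khronos-provided functions involved:

    // How __gpu_thread_id_x() lowers on SPIR-V, per the test in this PR:
    //   __gpu_thread_id_x()                                   (gpuintrin.h API)
    //     -> __builtin_spirv_local_invocation_id(0)           (spirvintrin.h)
    //     -> call i64 @llvm.spv.thread.id.in.group.i64(i32 0) (LLVM IR)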
Force-pushed: 7a4076d to b4ba797
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/33957
Summary:
#174862 and
#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).
This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just a starting
point; hopefully testing can begin later.
Would appreciate someone more familiar with the backend double-checking
these.