[OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE'#113156
Merged
Conversation
Summary: This is going to be deprecated in llvm#112849. This patch ports it to use the builtin instead. This isn't a compile constant, so it could slightly negatively affect codegen. There really should be an IR pass to turn it into a constant if the function has known attributes. Using the builtin is correct when we just do it for knowing the size like we do here. Obviously guarding w32/w64 code with this check would be broken.
Member
|
@llvm/pr-subscribers-offload Author: Joseph Huber (jhuber6) ChangesSummary: Using the builtin is correct when we just do it for knowing the size Full diff: https://github.com/llvm/llvm-project/pull/113156.diff 3 Files Affected:
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 3aefcff68e1956..881bd12f034051 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
namespace impl {
// Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
LaneMaskTy activemask();
LaneMaskTy lanemaskLT();
LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
uint32_t getNumberOfBlocksInKernel(int32_t Dim);
uint32_t getWarpIdInBlock();
uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-const llvm::omp::GV &getGridValue() {
- return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
LaneMaskTy activemask() { return __nvvm_activemask(); }
@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
#pragma omp end declare variant
///}
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
} // namespace impl
} // namespace ompx
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
index 101d1255f0d670..b810fb404b58f6 100644
--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
#include <stdio.h>
#include <stdlib.h>
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
int main(int argc, char *argv[]) {
const int num_blocks = 1;
const int block_size = 256;
const int N = num_blocks * block_size;
int *res = (int *)malloc(N * sizeof(int));
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
- map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks) \
+ thread_limit(block_size) map(from : res[0 : N])
{
int tid = ompx_thread_id_x();
uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
- res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
- res[tid] = mask == 0xaaaaaaaa;
-#endif
+ if (get_warp_size() == 64)
+ res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+ else
+ res[tid] = mask == 0xaaaaaaaa;
}
for (int i = 0; i < N; ++i)
diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
index 9b0e66e25f68c9..311999918de857 100644
--- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
+++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
#include <ompx.h>
#include <type_traits>
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
bool equal(T LHS, T RHS) {
return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
{
int tid = ompx_thread_id_x();
T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
- int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
- int warp_size = 32;
-#endif
+ int warp_size = get_warp_size();
if ((tid & (warp_size - 1)) != warp_size - 1)
res[tid] = equal(val, static_cast<T>(tid + 1));
else
|
shiltian
approved these changes
Oct 21, 2024
jplehr
approved these changes
Oct 31, 2024
arsenm
reviewed
Oct 31, 2024
Contributor
arsenm
left a comment
There was a problem hiding this comment.
This is a device only context where you might as well just use the raw constant
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Summary:
This is going to be deprecated in
#112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.
Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.