63 changes: 48 additions & 15 deletions openmp/device/include/Synchronization.h
@@ -193,18 +193,64 @@ atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,

} // namespace atomic

// FIXME: NVPTX does not respect the memory scope argument.
namespace fence {

/// Memory fence with \p Ordering semantics for the team.
static inline void team(atomic::OrderingTy Ordering) {
#ifdef __NVPTX__
  __nvvm_membar_cta();
#else
  __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
#endif
}

/// Memory fence with \p Ordering semantics for the contention group.
static inline void kernel(atomic::OrderingTy Ordering) {
#ifdef __NVPTX__
  __nvvm_membar_gl();
#else
  __scoped_atomic_thread_fence(Ordering, atomic::device);
#endif
}

/// Memory fence with \p Ordering semantics for the system.
static inline void system(atomic::OrderingTy Ordering) {
#ifdef __NVPTX__
  __nvvm_membar_sys();
#else
  __scoped_atomic_thread_fence(Ordering, atomic::system);
#endif
}

} // namespace fence

namespace synchronize {

/// Initialize the synchronization machinery. Must be called by all threads.
void init(bool IsSPMD);

/// Synchronize all threads in a warp identified by \p Mask.
void warp(LaneMaskTy Mask);
static inline void warp(LaneMaskTy Mask) { __gpu_sync_lane(Mask); }

/// Synchronize all threads in a block and perform a fence before and after the
/// barrier according to \p Ordering. Note that the fence might be part of the
/// barrier.
void threads(atomic::OrderingTy Ordering);
static inline void threads(atomic::OrderingTy Ordering) {
#if defined(__NVPTX__)
  __nvvm_barrier_sync(8);
#elif defined(__AMDGPU__)
  if (Ordering != atomic::relaxed)
    fence::team(Ordering == atomic::acq_rel ? atomic::release
                                            : atomic::seq_cst);

  __builtin_amdgcn_s_barrier();

  if (Ordering != atomic::relaxed)
    fence::team(Ordering == atomic::acq_rel ? atomic::acquire
                                            : atomic::seq_cst);
#else
  __gpu_sync_threads();
#endif
}

/// Synchronizing threads is allowed even if they all hit different instances of
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
@@ -224,19 +270,6 @@ threadsAligned(atomic::OrderingTy Ordering);

} // namespace synchronize

namespace fence {

/// Memory fence with \p Ordering semantics for the team.
void team(atomic::OrderingTy Ordering);

/// Memory fence with \p Ordering semantics for the contention group.
void kernel(atomic::OrderingTy Ordering);

/// Memory fence with \p Ordering semantics for the system.
void system(atomic::OrderingTy Ordering);

} // namespace fence

} // namespace ompx

#endif
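
For orientation, a minimal sketch of how a device-side caller might use the now header-inline API above. The team-shared `Flag` pointer and the `ompx::mapping::getThreadIdInBlock()` helper are assumptions for illustration, not part of this diff:

// Hypothetical caller: thread 0 of the team publishes a value, then every
// thread reads it after a fenced block barrier.
static inline uint32_t broadcastFromMain(uint32_t *Flag, uint32_t Value) {
  if (ompx::mapping::getThreadIdInBlock() == 0) // assumed mapping helper
    *Flag = Value;
  // threads() itself brackets the barrier with release/acquire team fences
  // (see the AMDGPU path above), so no separate fence::team() call is needed.
  ompx::synchronize::threads(ompx::atomic::acq_rel);
  return *Flag; // all team members observe Value here
}

Callers that only need ordering without a barrier can use the scoped fences directly, e.g. `ompx::fence::team(ompx::atomic::release)` before a relaxed store that a later team-scope acquire will pair with.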
76 changes: 19 additions & 57 deletions openmp/device/src/Synchronization.cpp
@@ -108,34 +108,9 @@ void namedBarrier() {
  fence::team(atomic::release);
}

void fenceTeam(atomic::OrderingTy Ordering) {
  return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
}

void fenceKernel(atomic::OrderingTy Ordering) {
  return __scoped_atomic_thread_fence(Ordering, atomic::device);
}

void fenceSystem(atomic::OrderingTy Ordering) {
  return __scoped_atomic_thread_fence(Ordering, atomic::system);
}

void syncWarp(__kmpc_impl_lanemask_t) {
  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
  // to enforce convergent behaviour between control flow graphs.
  __builtin_amdgcn_wave_barrier();
}

void syncThreads(atomic::OrderingTy Ordering) {
  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);

  __builtin_amdgcn_s_barrier();

  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
void syncThreadsAligned(atomic::OrderingTy Ordering) {
  synchronize::threads(Ordering);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }

void initLock(omp_lock_t *Lock) { unsetLock(Lock); }

@@ -153,18 +128,19 @@ void setLock(omp_lock_t *Lock) {
}

void unsetCriticalLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
  [[maybe_unused]] uint32_t before =
      atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
}

void setCriticalLock(omp_lock_t *Lock) {
  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
    fenceKernel(atomic::release);
    fence::kernel(atomic::release);
    while (!atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed,
                        atomic::relaxed)) {
      __builtin_amdgcn_s_sleep(32);
    }
    fenceKernel(atomic::acquire);
    fence::kernel(atomic::acquire);
  }
}

@@ -188,31 +164,29 @@ void namedBarrier() {
  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}

void fenceTeam(int) { __nvvm_membar_cta(); }

void fenceKernel(int) { __nvvm_membar_gl(); }

void fenceSystem(int) { __nvvm_membar_sys(); }
void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }

void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

void syncThreads(atomic::OrderingTy Ordering) {
  constexpr int BarrierNo = 8;
  __nvvm_barrier_sync(BarrierNo);
void unsetLock(omp_lock_t *Lock) {
  [[maybe_unused]] uint32_t before = atomicExchange(
      reinterpret_cast<uint32_t *>(Lock), UNSET, atomic::seq_cst);
}

void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }

constexpr uint32_t OMP_SPIN = 1000;
int testLock(omp_lock_t *Lock) {
  return atomic::add(reinterpret_cast<uint32_t *>(Lock), 0u, atomic::seq_cst);
}

void initLock(omp_lock_t *Lock) { unsetLock(Lock); }

void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }

void setLock(omp_lock_t *Lock) {
  // TODO: not sure spinning is a good idea here..
  while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
                     atomic::seq_cst) != UNSET) {
  while (atomic::cas(reinterpret_cast<uint32_t *>(Lock), UNSET, SET,
                     atomic::seq_cst, atomic::seq_cst) != UNSET) {
    int32_t start = __nvvm_read_ptx_sreg_clock();
    int32_t now;
    for (;;) {
@@ -270,22 +244,10 @@ void synchronize::init(bool IsSPMD) {
  impl::namedBarrierInit();
}

void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }

void synchronize::threads(atomic::OrderingTy Ordering) {
  impl::syncThreads(Ordering);
}

void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
  impl::syncThreadsAligned(Ordering);
}

void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }

void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }

void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }

void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
@@ -378,6 +340,6 @@ void ompx_sync_block_acq_rel() {
  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
}
void ompx_sync_block_divergent(int Ordering) {
  impl::syncThreads(atomic::OrderingTy(Ordering));
  synchronize::threads(atomic::OrderingTy(Ordering));
}
} // extern "C"
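
Taken together, the AMDGPU `setCriticalLock` path above is a lane election plus a fenced spin; a condensed, commented restatement of that pattern follows, using only identifiers that appear in this diff (a standalone sketch, not a drop-in for the file):

void setCriticalLockSketch(omp_lock_t *Lock) {
  // Elect the lowest active lane so a whole warp does not hammer the line.
  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
    fence::kernel(atomic::release); // release fence before the spin, as above
    while (!atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed,
                        atomic::relaxed))
      __builtin_amdgcn_s_sleep(32); // back off between CAS attempts
    fence::kernel(atomic::acquire); // observe the previous holder's writes
  }
}

The CAS itself can stay relaxed only because the explicit kernel-scope fences supply the ordering, mirroring the release/acquire bracketing that `synchronize::threads()` applies around the AMDGPU barrier.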