From 626033377be3028cb864b62d2fdb3e248e9d1759 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Tue, 26 Oct 2021 15:18:55 +0200 Subject: [PATCH 01/27] [SYCL] Added tests for atomics with various memory orders and scopes --- SYCL/AtomicRef/add.cpp | 6 - SYCL/AtomicRef/add.h | 127 ++++++++++++++---- SYCL/AtomicRef/add_atomic64.cpp | 6 - SYCL/AtomicRef/add_orders_scopes.cpp | 49 +++++++ SYCL/AtomicRef/and.h | 96 +++++++++++++ SYCL/AtomicRef/and_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/compare_exchange.h | 104 +++++++++++--- .../compare_exchange_orders_scopes.cpp | 46 +++++++ SYCL/AtomicRef/exchange.h | 66 ++++++++- SYCL/AtomicRef/exchange_orders_scopes.cpp | 48 +++++++ SYCL/AtomicRef/max.h | 74 +++++++++- SYCL/AtomicRef/max_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/min.h | 68 +++++++++- SYCL/AtomicRef/min_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/or.h | 89 ++++++++++++ SYCL/AtomicRef/or_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/xor.h | 96 +++++++++++++ SYCL/AtomicRef/xor_orders_scopes.cpp | 42 ++++++ SYCL/Reduction/reduction_range_1d_s0_dw.cpp | 2 +- SYCL/Reduction/reduction_range_1d_s0_rw.cpp | 2 +- SYCL/Reduction/reduction_range_1d_s1_dw.cpp | 2 +- SYCL/Reduction/reduction_range_1d_s1_rw.cpp | 2 +- SYCL/Reduction/reduction_range_2d_s1_dw.cpp | 2 +- SYCL/Reduction/reduction_range_2d_s1_rw.cpp | 2 +- SYCL/Reduction/reduction_range_3d_s1_dw.cpp | 2 +- SYCL/Reduction/reduction_range_3d_s1_rw.cpp | 2 +- SYCL/Reduction/reduction_range_usm_dw.cpp | 2 +- 27 files changed, 1016 insertions(+), 87 deletions(-) create mode 100644 SYCL/AtomicRef/add_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/and.h create mode 100644 SYCL/AtomicRef/and_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/compare_exchange_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/exchange_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/max_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/min_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/or.h create mode 100644 
SYCL/AtomicRef/or_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/xor.h create mode 100644 SYCL/AtomicRef/xor_orders_scopes.cpp diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index e2687fc841..64fc4fcd79 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -9,12 +9,6 @@ #include using namespace sycl; -// Floating-point types do not support pre- or post-increment -template <> void add_test(queue q, size_t N) { - add_fetch_test(q, N); - add_plus_equal_test(q, N); -} - int main() { queue q; diff --git a/SYCL/AtomicRef/add.h b/SYCL/AtomicRef/add.h index 99f4780ce9..5a1bfd06d2 100644 --- a/SYCL/AtomicRef/add.h +++ b/SYCL/AtomicRef/add.h @@ -4,12 +4,63 @@ #include #include #include +#include #include using namespace sycl; using namespace sycl::ext::oneapi; -template +template +void add_fetch_local_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_add(Difference(1), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + sum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template void add_fetch_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -17,18 +68,20 @@ void add_fetch_test(queue q, size_t N) { { buffer sum_buf(&sum, 1); buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); - out[gid] = atm.fetch_add(Difference(1)); - }); - }); + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm.fetch_add(Difference(1), order); + }); + }).wait_and_throw(); } // All work-items increment by 1, so final value should be equal to N @@ -37,14 +90,16 @@ void add_fetch_test(queue q, size_t N) { // Fetch returns original value: will be in [0, N-1] auto min_e = std::min_element(output.begin(), output.end()); auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(0) && *max_e == T(N - 1)); + assert(*min_e == 0 && *max_e == T(N - 1)); // Intermediate values should be unique std::sort(output.begin(), output.end()); assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_plus_equal_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -59,8 +114,11 @@ void add_plus_equal_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); out[gid] = atm += Difference(1); }); }); @@ -79,7 +137,9 @@ void add_plus_equal_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_pre_inc_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -94,8 +154,11 @@ void add_pre_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); out[gid] = ++atm; }); }); @@ -114,7 +177,9 @@ void add_pre_inc_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_post_inc_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -129,8 +194,11 @@ void add_post_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); out[gid] = atm++; }); }); @@ -149,10 +217,15 @@ void add_post_inc_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_test(queue q, size_t N) { - add_fetch_test(q, N); - add_plus_equal_test(q, N); - add_pre_inc_test(q, N); - add_post_inc_test(q, N); + add_fetch_local_test(q, N); + add_fetch_test(q, N); + add_plus_equal_test(q, N); + if constexpr (!std::is_floating_point_v) { + add_pre_inc_test(q, N); + add_post_inc_test(q, N); + } } diff --git a/SYCL/AtomicRef/add_atomic64.cpp b/SYCL/AtomicRef/add_atomic64.cpp index 6059a7cd7f..13d47a406a 100644 --- a/SYCL/AtomicRef/add_atomic64.cpp +++ b/SYCL/AtomicRef/add_atomic64.cpp @@ -9,12 +9,6 @@ #include using namespace sycl; -// Floating-point types do not support pre- or post-increment -template <> void add_test(queue q, size_t N) { - add_fetch_test(q, N); - add_plus_equal_test(q, N); -} - int main() { queue q; diff --git a/SYCL/AtomicRef/add_orders_scopes.cpp b/SYCL/AtomicRef/add_orders_scopes.cpp new file mode 100644 index 0000000000..6be7c66186 --- /dev/null +++ b/SYCL/AtomicRef/add_orders_scopes.cpp @@ -0,0 +1,49 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ +// RUN: 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#define SYCL_USE_NATIVE_FP_ATOMICS + +#include "add.h" +#include +using namespace sycl; + +template +void add_test_scopes(queue q, size_t N) { + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); +} + +template +void add_test_orders_scopes(queue q, size_t N) { + add_test_scopes(q, N); + add_test_scopes(q, N); + add_test_scopes(q, N); + add_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/and.h b/SYCL/AtomicRef/and.h new file mode 100644 index 0000000000..3d5e7db2cf --- /dev/null +++ b/SYCL/AtomicRef/and.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +template +void and_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = T((1ll << N) - 1); + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_global_test(queue q) { + const size_t N = 32; + const T initial = T((1ll << N) - 1); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + 
auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_test(queue q) { + and_local_test(q); + and_global_test(q); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/and_orders_scopes.cpp b/SYCL/AtomicRef/and_orders_scopes.cpp new file mode 100644 index 0000000000..1b020e6176 --- /dev/null +++ b/SYCL/AtomicRef/and_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "and.h" +#include +using namespace sycl; + +template +void and_test_scopes(queue q) { + and_test(q); + and_test(q); + and_test(q); + and_test(q); +} + +template void and_test_orders_scopes(queue q) { + and_test_scopes(q); + and_test_scopes(q); + and_test_scopes(q); + and_test_scopes(q); +} + +int main() { + queue q; + + constexpr int N = 32; + and_test_orders_scopes(q); + and_test_orders_scopes(q); + and_test_orders_scopes(q); + and_test_orders_scopes(q); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + and_test_orders_scopes(q); + and_test_orders_scopes(q); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/compare_exchange.h b/SYCL/AtomicRef/compare_exchange.h index 04da52b81f..72107c8b18 100644 --- a/SYCL/AtomicRef/compare_exchange.h +++ b/SYCL/AtomicRef/compare_exchange.h @@ -9,9 +9,61 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template class compare_exchange_kernel; +template +void compare_exchange_local_test(queue q, size_t N) { + const T initial = T(N); + T compare_exchange = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto compare_exchange = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + it.barrier(access::fence_space::local_space); + if (gid == 0) + compare_exchange[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); -template void compare_exchange_test(queue q, size_t N) { + // All other values should be the index itself or the sentinel value + for (size_t i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +template +void compare_exchange_global_test(queue q, size_t N) { const T initial = T(N); T compare_exchange = initial; std::vector output(N); @@ -21,26 +73,27 @@ template void compare_exchange_test(queue q, size_t N) { buffer output_buf(output.data(), output.size()); q.submit([&](handler &cgh) { - auto exc = - compare_exchange_buf.template get_access( - cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for>( - range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = - atomic_ref(exc[0]); - T result = T(N); // Avoid copying pointer - bool success = atm.compare_exchange_strong(result, (T)gid); - if (success) { - out[gid] = result; - } else { - out[gid] = T(gid); - } - }); - }); + auto exc = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + }); + }).wait_and_throw(); } // Only one work-item should have received the initial sentinel value @@ -51,3 +104,10 @@ template void compare_exchange_test(queue q, size_t N) { assert(output[i] == T(i) || output[i] == initial); } } + +template +void compare_exchange_test(queue q, size_t N) { + compare_exchange_local_test(q, N); + compare_exchange_global_test(q, N); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp b/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp new file mode 100644 index 0000000000..15c36c6e7c --- /dev/null +++ b/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp @@ -0,0 +1,46 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "compare_exchange.h" +#include +using namespace sycl; + +template +void compare_exchange_test_scopes(queue q, size_t N) { + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); +} + +template +void compare_exchange_test_orders_scopes(queue q, size_t N) { + compare_exchange_test_scopes(q, N); + compare_exchange_test_scopes(q, N); + compare_exchange_test_scopes(q, N); + compare_exchange_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, 
N); + compare_exchange_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + } + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/exchange.h b/SYCL/AtomicRef/exchange.h index a050ddaf4d..b8a2c7f81f 100644 --- a/SYCL/AtomicRef/exchange.h +++ b/SYCL/AtomicRef/exchange.h @@ -9,9 +9,53 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template class exchange_kernel; +template +void exchange_local_test(queue q, size_t N) { + const T initial = T(N); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); -template void exchange_test(queue q, size_t N) { + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.exchange(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it + // reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void exchange_global_test(queue q, size_t N) { const T initial = T(N); T exchange = initial; std::vector output(N); @@ -25,11 +69,14 @@ template void exchange_test(queue q, size_t N) { exchange_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); - cgh.parallel_for>(range<1>(N), [=](item<1> it) { + cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref(exc[0]); - out[gid] = atm.exchange(T(gid)); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + out[gid] = atm.exchange(T(gid), order); }); }); } @@ -42,3 +89,10 @@ template void exchange_test(queue q, size_t N) { std::sort(output.begin(), output.end()); assert(std::unique(output.begin(), output.end()) == output.end()); } + +template +void exchange_test(queue q, size_t N) { + exchange_local_test(q, N); + exchange_global_test(q, N); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/exchange_orders_scopes.cpp b/SYCL/AtomicRef/exchange_orders_scopes.cpp new file mode 100644 index 0000000000..d3a9ee2c1d --- /dev/null +++ b/SYCL/AtomicRef/exchange_orders_scopes.cpp @@ -0,0 +1,48 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "exchange.h" +#include +using namespace sycl; + +template +void exchange_test_scopes(queue q, size_t N) { + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); +} + +template void exchange_test_orders_scopes(queue q, size_t N) { + exchange_test_scopes(q, N); + exchange_test_scopes(q, N); + exchange_test_scopes(q, N); + exchange_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + + exchange_test_orders_scopes(q, N); + + exchange_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/max.h b/SYCL/AtomicRef/max.h index 4da562e8e4..d284a60096 100644 --- a/SYCL/AtomicRef/max.h +++ b/SYCL/AtomicRef/max.h @@ -9,7 +9,57 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template void max_test(queue q, size_t N) { +template +void max_local_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + assert(cum == N - 1 + std::numeric_limits::max() / 2); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +template +void max_global_test(queue q, size_t N) { T initial = std::numeric_limits::lowest(); T val = initial; std::vector output(N); @@ -24,17 +74,20 @@ template void max_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(val[0]); + auto atm = 
atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); - // +1 accounts for lowest() returning 0 for unsigned types - out[gid] = atm.fetch_max(T(gid) + 1); + // +max/2 to ensure correct signed/unsigned operation is applied + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); }); }); } - // Final value should be equal to N - assert(val == N); + assert(val == N - 1 + std::numeric_limits::max() / 2); // Only one work-item should have received the initial value assert(std::count(output.begin(), output.end(), initial) == 1); @@ -45,3 +98,10 @@ template void max_test(queue q, size_t N) { assert(output[i] >= initial); } } + +template +void max_test(queue q, size_t N) { + max_local_test(q, N); + max_global_test(q, N); +} diff --git a/SYCL/AtomicRef/max_orders_scopes.cpp b/SYCL/AtomicRef/max_orders_scopes.cpp new file mode 100644 index 0000000000..0320be5969 --- /dev/null +++ b/SYCL/AtomicRef/max_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "max.h" +#include +using namespace sycl; + +template +void max_test_scopes(queue q, size_t N) { + max_test(q, N); + max_test(q, N); + max_test(q, N); + max_test(q, N); +} + +template void max_test_orders_scopes(queue q, size_t N) { + max_test_scopes(q, N); + max_test_scopes(q, N); + max_test_scopes(q, N); + max_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + 
max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + } + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/min.h b/SYCL/AtomicRef/min.h index a493cd8840..ac5d32bebf 100644 --- a/SYCL/AtomicRef/min.h +++ b/SYCL/AtomicRef/min.h @@ -9,7 +9,57 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template void min_test(queue q, size_t N) { +template +void min_local_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_min(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +template +void min_global_test(queue q, size_t N) { T initial = std::numeric_limits::max(); T val = initial; std::vector output(N); @@ -24,9 +74,12 @@ template void min_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(val[0]); - out[gid] = atm.fetch_min(T(gid)); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); + out[gid] = atm.fetch_min(T(gid), order); }); }); } @@ -43,3 +96,10 @@ template void min_test(queue q, size_t N) { assert(output[i] <= initial); } } + +template +void min_test(queue q, size_t N) { + min_local_test(q, N); + min_global_test(q, N); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/min_orders_scopes.cpp b/SYCL/AtomicRef/min_orders_scopes.cpp new file mode 100644 index 0000000000..fdf7c620a5 --- /dev/null +++ b/SYCL/AtomicRef/min_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "min.h" +#include +using namespace sycl; + +template +void min_test_scopes(queue q, size_t N) { + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); +} + +template void min_test_orders_scopes(queue q, size_t N) { + min_test_scopes(q, N); + min_test_scopes(q, N); + min_test_scopes(q, N); + min_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/or.h b/SYCL/AtomicRef/or.h new file mode 100644 index 0000000000..fa28b2f257 --- /dev/null +++ b/SYCL/AtomicRef/or.h @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +template +void or_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void or_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, 
+ (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} diff --git a/SYCL/AtomicRef/or_orders_scopes.cpp b/SYCL/AtomicRef/or_orders_scopes.cpp new file mode 100644 index 0000000000..2adad06a45 --- /dev/null +++ b/SYCL/AtomicRef/or_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "or.h" +#include +using namespace sycl; + +template +void or_test_scopes(queue q) { + or_test(q); + or_test(q); + or_test(q); + or_test(q); +} + +template void or_test_orders_scopes(queue q) { + or_test_scopes(q); + or_test_scopes(q); + or_test_scopes(q); + or_test_scopes(q); +} + +int main() { + queue q; + + constexpr int N = 32; + or_test_orders_scopes(q); + or_test_orders_scopes(q); + or_test_orders_scopes(q); + or_test_orders_scopes(q); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/xor.h b/SYCL/AtomicRef/xor.h new file mode 100644 index 0000000000..d928d81453 --- /dev/null +++ b/SYCL/AtomicRef/xor.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +template +void xor_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_global_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = 
atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_test(queue q) { + xor_local_test(q); + xor_global_test(q); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/xor_orders_scopes.cpp b/SYCL/AtomicRef/xor_orders_scopes.cpp new file mode 100644 index 0000000000..3bd70c0550 --- /dev/null +++ b/SYCL/AtomicRef/xor_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "xor.h" +#include +using namespace sycl; + +template +void xor_test_scopes(queue q) { + xor_test(q); + xor_test(q); + xor_test(q); + xor_test(q); +} + +template void xor_test_orders_scopes(queue q) { + xor_test_scopes(q); + xor_test_scopes(q); + xor_test_scopes(q); + xor_test_scopes(q); +} + +int main() { + queue q; + + constexpr int N = 32; + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp index d62c30e8b3..0725df16fb 100644 --- a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp index 74317e6d41..a8ff39fcdb 100644 --- a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp index 8bfffbc3e3..fafa75755b 100644 --- a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp index 359aa2f0fe..63dcd53ed3 100644 --- a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // 
RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp index 22b2d32103..4b85b529b3 100644 --- a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // diff --git a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp index b2fb2ba14d..057939f733 100644 --- a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp index c4b7a4ab6e..fdd26d3e91 100644 --- a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp index 79bc4eed55..048f6075f7 100644 --- a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o 
%t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_usm_dw.cpp b/SYCL/Reduction/reduction_range_usm_dw.cpp index e50626464e..950142f74e 100644 --- a/SYCL/Reduction/reduction_range_usm_dw.cpp +++ b/SYCL/Reduction/reduction_range_usm_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out From a900c8fc40293a3431da938c3b9bdba5924b2cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 18 Nov 2021 15:01:20 +0000 Subject: [PATCH 02/27] [SYCL] merged add tests into one file --- SYCL/AtomicRef/add.cpp | 459 ++++++++++++++++++++++++++- SYCL/AtomicRef/add.h | 231 -------------- SYCL/AtomicRef/add_atomic64.cpp | 41 --- SYCL/AtomicRef/add_orders_scopes.cpp | 49 --- 4 files changed, 445 insertions(+), 335 deletions(-) delete mode 100644 SYCL/AtomicRef/add.h delete mode 100644 SYCL/AtomicRef/add_atomic64.cpp delete mode 100644 SYCL/AtomicRef/add_orders_scopes.cpp diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index 64fc4fcd79..57c2fe488d 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -1,32 +1,463 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ -// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "add.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend 
--cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx 
-fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: 
%HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx 
-fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#define SYCL_USE_NATIVE_FP_ATOMICS + +#include +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void add_fetch_local_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_add(Difference(1), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + sum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_fetch_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm.fetch_add(Difference(1), order); + }); + }).wait_and_throw(); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_plus_equal_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm += Difference(1); + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // += returns updated value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == T(1) && *max_e == T(N)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_pre_inc_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = ++atm; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Pre-increment returns updated value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == T(1) && *max_e == T(N)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_post_inc_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm++; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Post-increment returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == T(0) && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_test(queue q, size_t N) { + add_fetch_local_test(q, N); + add_fetch_test(q, N); + add_plus_equal_test(q, N); + if constexpr (!std::is_floating_point_v) { + add_pre_inc_test(q, N); + add_post_inc_test(q, N); + } +} + +template +void add_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test(q,N); +#else + add_test(q,N); +#endif +} + +template +void + add_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test_scopes(q,N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test_scopes(q,N); 
+#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test_scopes(q,N); +#else + add_test_scopes(q,N); +#endif +} + int main() { queue q; constexpr int N = 32; - add_test(q, N); - add_test(q, N); - add_test(q, N); - - // Include long tests if they are 32 bits wide - if constexpr (sizeof(long) == 4) { - add_test(q, N); - add_test(q, N); +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + if constexpr (sizeof(long) == 8) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); } - // Include pointer tests if they are 32 bits wide - if constexpr (sizeof(char *) == 4) { - add_test(q, N); + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); } +#else + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 4) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + } +#endif + std::cout << "Test passed." 
<< std::endl; } + diff --git a/SYCL/AtomicRef/add.h b/SYCL/AtomicRef/add.h deleted file mode 100644 index 5a1bfd06d2..0000000000 --- a/SYCL/AtomicRef/add.h +++ /dev/null @@ -1,231 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void add_fetch_local_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = 0; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_add(Difference(1), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - sum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Fetch returns original value: will be in [0, N-1] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == 0 && *max_e == T(N - 1)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_fetch_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto sum = 
sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = atm.fetch_add(Difference(1), order); - }); - }).wait_and_throw(); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Fetch returns original value: will be in [0, N-1] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == 0 && *max_e == T(N - 1)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_plus_equal_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = atm += Difference(1); - }); - }); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // += returns updated value: will be in [1, N] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(1) && *max_e == T(N)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_pre_inc_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = ++atm; - }); - }); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Pre-increment returns updated value: will be in [1, N] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(1) && *max_e == T(N)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_post_inc_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = atm++; - }); - }); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Post-increment returns original value: will be in [0, N-1] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(0) && *max_e == T(N - 1)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_test(queue q, size_t N) { - add_fetch_local_test(q, N); - add_fetch_test(q, N); - add_plus_equal_test(q, N); - if constexpr (!std::is_floating_point_v) { - add_pre_inc_test(q, N); - add_post_inc_test(q, N); - } -} diff --git a/SYCL/AtomicRef/add_atomic64.cpp b/SYCL/AtomicRef/add_atomic64.cpp deleted file mode 100644 index 13d47a406a..0000000000 --- a/SYCL/AtomicRef/add_atomic64.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ -// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "add.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - add_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - add_test(q, N); - add_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - add_test(q, N); - add_test(q, N); - } - - // Include pointer tests if they are 64 bits wide - if constexpr (sizeof(char *) == 8) { - add_test(q, N); - } - - std::cout << "Test 
passed." << std::endl; -} diff --git a/SYCL/AtomicRef/add_orders_scopes.cpp b/SYCL/AtomicRef/add_orders_scopes.cpp deleted file mode 100644 index 6be7c66186..0000000000 --- a/SYCL/AtomicRef/add_orders_scopes.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ -// RUN: -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#define SYCL_USE_NATIVE_FP_ATOMICS - -#include "add.h" -#include -using namespace sycl; - -template -void add_test_scopes(queue q, size_t N) { - add_test(q, N); - add_test(q, N); - add_test(q, N); - add_test(q, N); -} - -template -void add_test_orders_scopes(queue q, size_t N) { - add_test_scopes(q, N); - add_test_scopes(q, N); - add_test_scopes(q, N); - add_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} From d7f7e345ac8291fe1f8f2519101dc1e594853ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 19 Nov 2021 11:29:04 +0000 Subject: [PATCH 03/27] [SYCL] merged tests for other operations --- SYCL/AtomicRef/add.cpp | 15 +- SYCL/AtomicRef/and.cpp | 321 +++++++++++++++++ SYCL/AtomicRef/and.h | 96 ----- SYCL/AtomicRef/and_orders_scopes.cpp | 42 --- SYCL/AtomicRef/compare_exchange.cpp | 339 +++++++++++++++++- SYCL/AtomicRef/compare_exchange.h | 113 ------ SYCL/AtomicRef/compare_exchange_atomic64.cpp | 40 --- .../compare_exchange_orders_scopes.cpp | 46 --- SYCL/AtomicRef/exchange.cpp | 323 ++++++++++++++++- SYCL/AtomicRef/exchange.h | 98 ----- SYCL/AtomicRef/exchange_atomic64.cpp | 40 --- SYCL/AtomicRef/exchange_orders_scopes.cpp | 48 --- SYCL/AtomicRef/max.cpp | 327 ++++++++++++++++- SYCL/AtomicRef/max.h | 107 ------ SYCL/AtomicRef/max_atomic64.cpp | 35 -- SYCL/AtomicRef/max_orders_scopes.cpp | 42 --- SYCL/AtomicRef/min.cpp | 325 ++++++++++++++++- SYCL/AtomicRef/min.h | 105 ------ SYCL/AtomicRef/min_atomic64.cpp | 35 -- SYCL/AtomicRef/min_orders_scopes.cpp | 42 --- SYCL/AtomicRef/or.cpp | 322 +++++++++++++++++ SYCL/AtomicRef/or.h | 89 ----- SYCL/AtomicRef/or_orders_scopes.cpp | 42 --- SYCL/AtomicRef/xor.cpp | 322 +++++++++++++++++ SYCL/AtomicRef/xor.h | 96 ----- SYCL/AtomicRef/xor_orders_scopes.cpp | 42 --- 26 files changed, 2250 insertions(+), 1202 deletions(-) create mode 100644 SYCL/AtomicRef/and.cpp delete mode 100644 SYCL/AtomicRef/and.h delete mode 100644 SYCL/AtomicRef/and_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/compare_exchange.h delete mode 100644 SYCL/AtomicRef/compare_exchange_atomic64.cpp delete mode 100644 SYCL/AtomicRef/compare_exchange_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/exchange.h delete mode 100644 SYCL/AtomicRef/exchange_atomic64.cpp delete mode 100644 SYCL/AtomicRef/exchange_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/max.h delete mode 100644 SYCL/AtomicRef/max_atomic64.cpp 
delete mode 100644 SYCL/AtomicRef/max_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/min.h delete mode 100644 SYCL/AtomicRef/min_atomic64.cpp delete mode 100644 SYCL/AtomicRef/min_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/or.cpp delete mode 100644 SYCL/AtomicRef/or.h delete mode 100644 SYCL/AtomicRef/or_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/xor.cpp delete mode 100644 SYCL/AtomicRef/xor.h delete mode 100644 SYCL/AtomicRef/xor_orders_scopes.cpp diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index 57c2fe488d..af0db82b8c 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -1,3 +1,6 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -436,28 +439,32 @@ int main() { std::cout << "Skipping test\n"; return 0; } + + add_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); } - - // Include long long tests if they are 64 bits wide if constexpr (sizeof(long long) == 8) { add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); } + if constexpr (sizeof(char *) == 8) { + add_test(q, N); + } #else add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); if constexpr (sizeof(long) == 4) { add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); } + if constexpr (sizeof(char *) == 4) { + add_test_orders_scopes(q, N); + } #endif - std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/and.cpp b/SYCL/AtomicRef/and.cpp new file mode 100644 index 0000000000..89355aa9e4 --- /dev/null +++ b/SYCL/AtomicRef/and.cpp @@ -0,0 +1,321 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda 
-fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: 
%GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda 
-fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void and_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + 
std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = T((1ll << N) - 1); + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_global_test(queue q) { + const size_t N = 32; + const T initial = T((1ll << N) - 1); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_test(queue q) { + and_local_test(q); + and_global_test(q); +} + +template +void and_test_scopes(queue q) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test(q); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test(q); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test(q); +#else + and_test(q); +#endif +} + +template void and_test_orders_scopes(queue q) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test_scopes(q); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test_scopes(q); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test_scopes(q); +#else + and_test_scopes(q); +#endif +} + +int main() { + queue q; + +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + if constexpr (sizeof(long) == 8) { +
and_test_orders_scopes(q); + and_test_orders_scopes(q); + } + if constexpr (sizeof(long long) == 8) { + and_test_orders_scopes(q); + and_test_orders_scopes(q); + } +#else + and_test_orders_scopes(q); + and_test_orders_scopes(q); + if constexpr (sizeof(long) == 4) { + and_test_orders_scopes(q); + and_test_orders_scopes(q); + } +#endif + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/and.h b/SYCL/AtomicRef/and.h deleted file mode 100644 index 3d5e7db2cf..0000000000 --- a/SYCL/AtomicRef/and.h +++ /dev/null @@ -1,96 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void and_local_test(queue q) { - const size_t N = 32; - T cum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = T((1ll << N) - 1); - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_and(~T(1ll << gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to 0 - assert(cum == 0); - - // All other values should be unique; each work-item sets one bit to 0 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void and_global_test(queue q) { - const size_t N = 32; - const T initial = T((1ll << N) - 1); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (cum[0]); - out[gid] = atm.fetch_and(~T(1ll << gid), order); - }); - }); - } - - // Final value should be equal to 0 - assert(cum == 0); - - // All other values should be unique; each work-item sets one bit to 0 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void and_test(queue q) { - and_local_test(q); - and_global_test(q); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/and_orders_scopes.cpp b/SYCL/AtomicRef/and_orders_scopes.cpp deleted file mode 100644 index 1b020e6176..0000000000 --- a/SYCL/AtomicRef/and_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "and.h" -#include -using namespace sycl; - -template -void and_test_scopes(queue q) { - and_test(q); - and_test(q); - and_test(q); - and_test(q); -} - -template void and_test_orders_scopes(queue q) { - and_test_scopes(q); - and_test_scopes(q); - and_test_scopes(q); - and_test_scopes(q); -} - -int main() { - queue q; - - constexpr int N = 32; - and_test_orders_scopes(q); - and_test_orders_scopes(q); - and_test_orders_scopes(q); - and_test_orders_scopes(q); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - and_test_orders_scopes(q); - and_test_orders_scopes(q); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp index 278f74e8c8..3df693f173 100644 --- a/SYCL/AtomicRef/compare_exchange.cpp +++ b/SYCL/AtomicRef/compare_exchange.cpp @@ -1,31 +1,348 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM // RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "compare_exchange.h" +#include +#include +#include +#include +#include #include + using 
namespace sycl; +template +void compare_exchange_local_test(queue q, size_t N) { + const T initial = T(N); + T compare_exchange = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto compare_exchange = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + it.barrier(access::fence_space::local_space); + if (gid == 0) + compare_exchange[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be the index itself or the sentinel value + for (size_t i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +template +void compare_exchange_global_test(queue q, size_t N) { + const T initial = T(N); + T compare_exchange = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto exc = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), 
[=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be the index itself or the sentinel value + for (size_t i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +template +void compare_exchange_test(queue q, size_t N) { + compare_exchange_local_test(q, N); + compare_exchange_global_test(q, N); +} + +template +void compare_exchange_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test(q,N); +#else + compare_exchange_test(q,N); +#endif +} + +template +void compare_exchange_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test_scopes(q,N); +#elif defined(ACQUIRE) + 
if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test_scopes(q,N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test_scopes(q,N); +#else + compare_exchange_test_scopes(q,N); +#endif +} + int main() { queue q; constexpr int N = 32; - compare_exchange_test(q, N); - compare_exchange_test(q, N); - compare_exchange_test(q, N); +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + compare_exchange_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(char *) == 8) { + compare_exchange_test_orders_scopes(q, N); + } +#else + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); - // Include long tests if they are 32 bits wide if constexpr (sizeof(long) == 4) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); } - - // Include pointer tests if they are 32 bits wide if constexpr (sizeof(char *) == 4) { - compare_exchange_test(q, N); + compare_exchange_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/compare_exchange.h b/SYCL/AtomicRef/compare_exchange.h deleted file mode 100644 index 72107c8b18..0000000000 --- a/SYCL/AtomicRef/compare_exchange.h +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void compare_exchange_local_test(queue q, size_t N) { - const T initial = T(N); - T compare_exchange = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer compare_exchange_buf(&compare_exchange, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto compare_exchange = - compare_exchange_buf.template get_access( - cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - T result = T(N); // Avoid copying pointer - bool success = atm.compare_exchange_strong(result, (T)gid, order); - if (success) { - out[gid] = result; - } else { - out[gid] = T(gid); - } - it.barrier(access::fence_space::local_space); - if (gid == 0) - compare_exchange[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be the index itself or the sentinel value - for (size_t i = 0; i < N; ++i) { - assert(output[i] == T(i) || output[i] == initial); - } -} - -template -void compare_exchange_global_test(queue q, size_t N) { - const T initial = T(N); - T compare_exchange = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer compare_exchange_buf(&compare_exchange, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto exc = - compare_exchange_buf.template get_access( - cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (exc[0]); - T result = T(N); // Avoid copying pointer - bool success = atm.compare_exchange_strong(result, (T)gid, order); - if (success) { - out[gid] = result; - } else { - out[gid] = T(gid); - } - }); - }).wait_and_throw(); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be the index itself or the sentinel value - for (size_t i = 0; i < N; ++i) { - assert(output[i] == T(i) || output[i] == initial); - } -} - -template -void compare_exchange_test(queue q, size_t N) { - compare_exchange_local_test(q, N); - compare_exchange_global_test(q, N); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/compare_exchange_atomic64.cpp b/SYCL/AtomicRef/compare_exchange_atomic64.cpp deleted file mode 100644 index 600b0920e4..0000000000 --- a/SYCL/AtomicRef/compare_exchange_atomic64.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "compare_exchange.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - compare_exchange_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); - } - - // Include pointer tests if they are 64 bits wide - if constexpr (sizeof(char *) == 8) { - compare_exchange_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp b/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp deleted file mode 100644 index 15c36c6e7c..0000000000 --- a/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "compare_exchange.h" -#include -using namespace sycl; - -template -void compare_exchange_test_scopes(queue q, size_t N) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); - compare_exchange_test(q, N); - compare_exchange_test(q, N); -} - -template -void compare_exchange_test_orders_scopes(queue q, size_t N) { - compare_exchange_test_scopes(q, N); - compare_exchange_test_scopes(q, N); - compare_exchange_test_scopes(q, N); - compare_exchange_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp index 0c250bbf11..805c3ede18 100644 --- a/SYCL/AtomicRef/exchange.cpp +++ b/SYCL/AtomicRef/exchange.cpp @@ -1,31 +1,332 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// 
RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 
-DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 // RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "exchange.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// 
RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void exchange_local_test(queue q, 
size_t N) { + const T initial = T(N); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.exchange(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it + // reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void exchange_global_test(queue q, size_t N) { + const T initial = T(N); + T exchange = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer exchange_buf(&exchange, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto exc = + exchange_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + out[gid] = atm.exchange(T(gid), order); + }); + }); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it + // reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void exchange_test(queue q, size_t N) { + exchange_local_test(q, N); + exchange_global_test(q, N); +} + +template +void exchange_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test(q,N); +#else + exchange_test(q,N); +#endif +} + +template void exchange_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test_scopes(q, N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test_scopes(q, N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test_scopes(q, N); +#else + 
exchange_test_scopes(q, N); +#endif +} + int main() { queue q; constexpr int N = 32; - exchange_test(q, N); - exchange_test(q, N); - exchange_test(q, N); +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + exchange_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(char *) == 8) { + exchange_test_orders_scopes(q, N); + } +#else + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); - // Include long tests if they are 32 bits wide if constexpr (sizeof(long) == 4) { - exchange_test(q, N); - exchange_test(q, N); + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); } - - // Include pointer tests if they are 32 bits wide if constexpr (sizeof(char *) == 4) { - exchange_test(q, N); + exchange_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/exchange.h b/SYCL/AtomicRef/exchange.h deleted file mode 100644 index b8a2c7f81f..0000000000 --- a/SYCL/AtomicRef/exchange.h +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void exchange_local_test(queue q, size_t N) { - const T initial = T(N); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.exchange(T(gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be unique; each work-item replaces the value it - // reads with its own ID - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void exchange_global_test(queue q, size_t N) { - const T initial = T(N); - T exchange = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer exchange_buf(&exchange, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto exc = - exchange_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (exc[0]); - out[gid] = atm.exchange(T(gid), order); - }); - }); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be unique; each work-item replaces the value it - // reads with its own ID - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void exchange_test(queue q, size_t N) { - exchange_local_test(q, N); - exchange_global_test(q, N); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/exchange_atomic64.cpp b/SYCL/AtomicRef/exchange_atomic64.cpp deleted file mode 100644 index 50d665a355..0000000000 --- a/SYCL/AtomicRef/exchange_atomic64.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "exchange.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - exchange_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - exchange_test(q, N); - exchange_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - exchange_test(q, N); - exchange_test(q, N); - } - - // Include pointer tests if they are 64 bits wide - if constexpr (sizeof(char *) == 8) { - exchange_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/exchange_orders_scopes.cpp b/SYCL/AtomicRef/exchange_orders_scopes.cpp deleted file mode 100644 index d3a9ee2c1d..0000000000 --- a/SYCL/AtomicRef/exchange_orders_scopes.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "exchange.h" -#include -using namespace sycl; - -template -void exchange_test_scopes(queue q, size_t N) { - exchange_test(q, N); - exchange_test(q, N); - exchange_test(q, N); - exchange_test(q, N); -} - -template void exchange_test_orders_scopes(queue q, size_t N) { - exchange_test_scopes(q, N); - exchange_test_scopes(q, N); - exchange_test_scopes(q, N); - exchange_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - - exchange_test_orders_scopes(q, N); - - exchange_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp index 0e3517f922..81f1947fc9 100644 --- a/SYCL/AtomicRef/max.cpp +++ b/SYCL/AtomicRef/max.cpp @@ -1,26 +1,335 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. 
+ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// 
RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "max.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void max_local_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + 
it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + assert(cum == N - 1 + std::numeric_limits::max() / 2); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +template +void max_global_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T val = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), std::numeric_limits::max()); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); + + // +max/2 to ensure correct signed/unsigned operation is applied + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); + }); + }); + } + + assert(val == N - 1 + std::numeric_limits::max() / 2); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +template +void max_test(queue q, size_t N) { + max_local_test(q, N); + max_global_test(q, N); +} + +template +void max_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test(q,N); +#else + max_test(q,N); +#endif +} + +template void max_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test_scopes(q,N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test_scopes(q,N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + 
return; + } + max_test_scopes(q,N); +#else + max_test_scopes(q,N); +#endif +} + int main() { queue q; constexpr int N = 32; - max_test(q, N); - max_test(q, N); - max_test(q, N); - - // Include long tests if they are 32 bits wide +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + max_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + } +#else + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); if constexpr (sizeof(long) == 4) { - max_test(q, N); - max_test(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." << std::endl; } diff --git a/SYCL/AtomicRef/max.h b/SYCL/AtomicRef/max.h deleted file mode 100644 index d284a60096..0000000000 --- a/SYCL/AtomicRef/max.h +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void max_local_test(queue q, size_t N) { - T initial = std::numeric_limits::lowest(); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = - atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - assert(cum == N - 1 + std::numeric_limits::max() / 2); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_max returns original value - // Intermediate values should all be >= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] >= initial); - } -} - -template -void max_global_test(queue q, size_t N) { - T initial = std::numeric_limits::lowest(); - T val = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), std::numeric_limits::max()); - { - buffer val_buf(&val, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto val = val_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (val[0]); - - // +max/2 to ensure correct signed/unsigned operation is applied - out[gid] = - atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); - }); - }); - } - - assert(val == N - 1 + std::numeric_limits::max() / 2); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_max returns original value - // Intermediate values should all be >= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] >= initial); - } -} - -template -void max_test(queue q, size_t N) { - max_local_test(q, N); - max_global_test(q, N); -} diff --git a/SYCL/AtomicRef/max_atomic64.cpp b/SYCL/AtomicRef/max_atomic64.cpp deleted file mode 100644 index 2439644d31..0000000000 --- a/SYCL/AtomicRef/max_atomic64.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "max.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - max_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - max_test(q, N); - max_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - max_test(q, N); - max_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/max_orders_scopes.cpp b/SYCL/AtomicRef/max_orders_scopes.cpp deleted file mode 100644 index 0320be5969..0000000000 --- a/SYCL/AtomicRef/max_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "max.h" -#include -using namespace sycl; - -template -void max_test_scopes(queue q, size_t N) { - max_test(q, N); - max_test(q, N); - max_test(q, N); - max_test(q, N); -} - -template void max_test_orders_scopes(queue q, size_t N) { - max_test_scopes(q, N); - max_test_scopes(q, N); - max_test_scopes(q, N); - max_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp index d484911d96..9678867cfa 100644 --- a/SYCL/AtomicRef/min.cpp +++ b/SYCL/AtomicRef/min.cpp @@ -1,26 +1,333 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. 
+ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "min.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER 
%t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o 
%t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void min_local_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + 
auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_min(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +template +void min_global_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T val = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); + out[gid] = atm.fetch_min(T(gid), order); + }); + }); + } + + // Final value should be equal to 0 + assert(val == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +template +void min_test(queue q, size_t N) { + min_local_test(q, N); + min_global_test(q, N); +} + +template +void min_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test(q,N); +#else + min_test(q,N); +#endif +} + +template void min_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test_scopes(q,N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test_scopes(q,N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test_scopes(q,N); +#else + min_test_scopes(q,N); +#endif +} + int main() { queue q; 
constexpr int N = 32; - min_test(q, N); - min_test(q, N); - min_test(q, N); - - // Include long tests if they are 32 bits wide +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + min_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + } +#else + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); if constexpr (sizeof(long) == 4) { - min_test(q, N); - min_test(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." << std::endl; } diff --git a/SYCL/AtomicRef/min.h b/SYCL/AtomicRef/min.h deleted file mode 100644 index ac5d32bebf..0000000000 --- a/SYCL/AtomicRef/min.h +++ /dev/null @@ -1,105 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void min_local_test(queue q, size_t N) { - T initial = std::numeric_limits::max(); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_min(T(gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to 0 - assert(cum == 0); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_min returns original value - // Intermediate values should all be <= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] <= initial); - } -} - -template -void min_global_test(queue q, size_t N) { - T initial = std::numeric_limits::max(); - T val = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), 0); - { - buffer val_buf(&val, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto val = val_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (val[0]); - out[gid] = atm.fetch_min(T(gid), order); - }); - }); - } - - // Final value should be equal to 0 - assert(val == 0); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_min returns original value - // Intermediate values should all be <= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] <= initial); - } -} - -template -void min_test(queue q, size_t N) { - min_local_test(q, N); - min_global_test(q, N); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/min_atomic64.cpp b/SYCL/AtomicRef/min_atomic64.cpp deleted file mode 100644 index fc21b63d4e..0000000000 --- a/SYCL/AtomicRef/min_atomic64.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "min.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - min_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - min_test(q, N); - min_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - min_test(q, N); - min_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/min_orders_scopes.cpp b/SYCL/AtomicRef/min_orders_scopes.cpp deleted file mode 100644 index fdf7c620a5..0000000000 --- a/SYCL/AtomicRef/min_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "min.h" -#include -using namespace sycl; - -template -void min_test_scopes(queue q, size_t N) { - min_test(q, N); - min_test(q, N); - min_test(q, N); - min_test(q, N); -} - -template void min_test_orders_scopes(queue q, size_t N) { - min_test_scopes(q, N); - min_test_scopes(q, N); - min_test_scopes(q, N); - min_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/or.cpp b/SYCL/AtomicRef/or.cpp new file mode 100644 index 0000000000..d6dea3e36d --- /dev/null +++ b/SYCL/AtomicRef/or.cpp @@ -0,0 +1,322 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. 
+ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// 
RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void or_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == 
memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void or_global_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void or_test(queue q) { + or_local_test(q); + or_global_test(q); +} + +template +void or_test_scopes(queue q) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test(q); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test(q); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test(q); +#else + or_test(q); +#endif +} + +template void or_test_orders_scopes(queue q) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test_scopes(q); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test_scopes(q); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test_scopes(q); +#else + or_test_scopes(q); +#endif +} + +int main() { + queue q; + + constexpr int N = 32; +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + if 
constexpr (sizeof(long) == 8) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } + if constexpr (sizeof(long long) == 8) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } +#else + or_test_orders_scopes(q); + or_test_orders_scopes(q); + if constexpr (sizeof(long) == 4) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } +#endif + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/or.h b/SYCL/AtomicRef/or.h deleted file mode 100644 index fa28b2f257..0000000000 --- a/SYCL/AtomicRef/or.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void or_local_test(queue q) { - const size_t N = 32; - T cum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = 0; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_or(T(1ll << gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each work-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void or_test(queue q) { - const size_t N = 32; - const T initial = 0; - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (cum[0]); - out[gid] = atm.fetch_or(T(1ll << gid), order); - }); - }); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each work-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} diff --git a/SYCL/AtomicRef/or_orders_scopes.cpp b/SYCL/AtomicRef/or_orders_scopes.cpp deleted file mode 100644 index 2adad06a45..0000000000 --- a/SYCL/AtomicRef/or_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "or.h" -#include -using namespace sycl; - -template -void or_test_scopes(queue q) { - or_test(q); - or_test(q); - or_test(q); - or_test(q); -} - -template void or_test_orders_scopes(queue q) { - or_test_scopes(q); - or_test_scopes(q); - or_test_scopes(q); - or_test_scopes(q); -} - -int main() { - queue q; - - constexpr int N = 32; - or_test_orders_scopes(q); - or_test_orders_scopes(q); - or_test_orders_scopes(q); - or_test_orders_scopes(q); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - or_test_orders_scopes(q); - or_test_orders_scopes(q); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/xor.cpp b/SYCL/AtomicRef/xor.cpp new file mode 100644 index 0000000000..d600ca10d9 --- /dev/null +++ b/SYCL/AtomicRef/xor.cpp @@ -0,0 +1,322 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. 
Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: 
%HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx 
-fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: 
%HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void xor_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + 
it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each wxork-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_global_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each wxork-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_test(queue q) { + xor_local_test(q); + xor_global_test(q); +} + +template +void xor_test_scopes(queue q) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test(q); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test(q); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test(q); +#else + xor_test(q); +#endif +} + +template void xor_test_orders_scopes(queue q) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test_scopes(q); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test_scopes(q); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test_scopes(q); +#else + xor_test_scopes(q); +#endif +} + +int main() { + queue q; + + constexpr int N = 32; +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; 
+ } + + if constexpr (sizeof(long) == 8) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } + if constexpr (sizeof(long long) == 8) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } +#else + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + if constexpr (sizeof(long) == 4) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } +#endif + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/xor.h b/SYCL/AtomicRef/xor.h deleted file mode 100644 index d928d81453..0000000000 --- a/SYCL/AtomicRef/xor.h +++ /dev/null @@ -1,96 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void xor_local_test(queue q) { - const size_t N = 32; - T cum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = 0; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_xor(T(1ll << gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each wxork-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void xor_global_test(queue q) { - const size_t N = 32; - const T initial = 0; - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (cum[0]); - out[gid] = atm.fetch_xor(T(1ll << gid), order); - }); - }); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each wxork-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void xor_test(queue q) { - xor_local_test(q); - xor_global_test(q); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/xor_orders_scopes.cpp b/SYCL/AtomicRef/xor_orders_scopes.cpp deleted file mode 100644 index 3bd70c0550..0000000000 --- a/SYCL/AtomicRef/xor_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "xor.h" -#include -using namespace sycl; - -template -void xor_test_scopes(queue q) { - xor_test(q); - xor_test(q); - xor_test(q); - xor_test(q); -} - -template void xor_test_orders_scopes(queue q) { - xor_test_scopes(q); - xor_test_scopes(q); - xor_test_scopes(q); - xor_test_scopes(q); -} - -int main() { - queue q; - - constexpr int N = 32; - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - } - - std::cout << "Test passed." 
<< std::endl; -} From 0375249316941a74f74ca5fd0eb6cf7f8e71fb56 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Fri, 19 Nov 2021 13:30:52 +0100 Subject: [PATCH 04/27] [SYCL] format --- SYCL/AtomicRef/add.cpp | 59 +++++++++++++++++------------ SYCL/AtomicRef/and.cpp | 36 +++++++++++------- SYCL/AtomicRef/compare_exchange.cpp | 48 +++++++++++++---------- SYCL/AtomicRef/exchange.cpp | 40 +++++++++++-------- SYCL/AtomicRef/max.cpp | 50 ++++++++++++++---------- SYCL/AtomicRef/min.cpp | 50 ++++++++++++++---------- SYCL/AtomicRef/or.cpp | 34 +++++++++++------ SYCL/AtomicRef/xor.cpp | 34 +++++++++++------ 8 files changed, 215 insertions(+), 136 deletions(-) diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index af0db82b8c..bd6d538b65 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -150,10 +152,10 @@ #include #include #include +#include #include #include #include -#include using namespace sycl; @@ -377,56 +379,64 @@ void add_test(queue q, size_t N) { } } -template +template void add_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - add_test(q,N); + add_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - add_test(q,N); + add_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - add_test(q,N); + add_test(q, N); #else - add_test(q,N); + add_test(q, N); #endif } template -void - add_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); +void add_test_orders_scopes(queue q, size_t N) { + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - add_test_scopes(q,N); + add_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if 
(std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } - add_test_scopes(q,N); + add_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - add_test_scopes(q,N); + add_test_scopes(q, N); #else - add_test_scopes(q,N); + add_test_scopes(q, N); #endif } @@ -439,7 +449,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + add_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { add_test_orders_scopes(q, N); @@ -464,7 +474,6 @@ int main() { add_test_orders_scopes(q, N); } #endif - + std::cout << "Test passed." << std::endl; } - diff --git a/SYCL/AtomicRef/and.cpp b/SYCL/AtomicRef/and.cpp index 89355aa9e4..d64353bb2e 100644 --- a/SYCL/AtomicRef/and.cpp +++ b/SYCL/AtomicRef/and.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -242,21 +244,25 @@ void and_test(queue q) { template void and_test_scopes(queue q) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } and_test(q); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } and_test(q); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } @@ -267,21 +273,25 @@ void and_test_scopes(queue q) { } template void and_test_orders_scopes(queue q) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } and_test_scopes(q); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } and_test_scopes(q); #elif defined(RELEASE) - if(std::find(orders.begin(), 
orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } @@ -293,13 +303,13 @@ template void and_test_orders_scopes(queue q) { int main() { queue q; - + #ifdef ATOMIC64 if (!q.get_device().has(aspect::atomic64)) { std::cout << "Skipping test\n"; return 0; } - + if constexpr (sizeof(long) == 8) { and_test_orders_scopes(q); and_test_orders_scopes(q); diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp index 3df693f173..b85f65e299 100644 --- a/SYCL/AtomicRef/compare_exchange.cpp +++ b/SYCL/AtomicRef/compare_exchange.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -259,53 +261,61 @@ void compare_exchange_test(queue q, size_t N) { template void compare_exchange_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test(q,N); + compare_exchange_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test(q,N); + compare_exchange_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test(q,N); + compare_exchange_test(q, N); #else - compare_exchange_test(q,N); + compare_exchange_test(q, N); #endif } template void compare_exchange_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #elif defined(ACQUIRE) 
- if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #else - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #endif } diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp index 805c3ede18..5b208b9e7b 100644 --- a/SYCL/AtomicRef/exchange.cpp +++ b/SYCL/AtomicRef/exchange.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -244,46 +246,54 @@ void exchange_test(queue q, size_t N) { template void exchange_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - exchange_test(q,N); + exchange_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - exchange_test(q,N); + exchange_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - exchange_test(q,N); + exchange_test(q, N); #else - exchange_test(q,N); + exchange_test(q, N); #endif } template void exchange_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } exchange_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), 
memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } exchange_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp index 81f1947fc9..dfc5696ba9 100644 --- a/SYCL/AtomicRef/max.cpp +++ b/SYCL/AtomicRef/max.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -253,52 +255,60 @@ void max_test(queue q, size_t N) { template void max_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - max_test(q,N); + max_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - max_test(q,N); + max_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - max_test(q,N); + max_test(q, N); #else - max_test(q,N); + max_test(q, N); #endif } template void max_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - max_test_scopes(q,N); + max_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + 
orders.end()) { std::cout << "Skipping test\n"; return; } - max_test_scopes(q,N); + max_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - max_test_scopes(q,N); + max_test_scopes(q, N); #else - max_test_scopes(q,N); + max_test_scopes(q, N); #endif } @@ -311,7 +321,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + max_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { max_test_orders_scopes(q, N); diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp index 9678867cfa..c5ffef16cf 100644 --- a/SYCL/AtomicRef/min.cpp +++ b/SYCL/AtomicRef/min.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -251,52 +253,60 @@ void min_test(queue q, size_t N) { template void min_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - min_test(q,N); + min_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - min_test(q,N); + min_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - min_test(q,N); + min_test(q, N); #else - min_test(q,N); + min_test(q, N); #endif } template void min_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - min_test_scopes(q,N); + min_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + 
orders.end()) { std::cout << "Skipping test\n"; return; } - min_test_scopes(q,N); + min_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - min_test_scopes(q,N); + min_test_scopes(q, N); #else - min_test_scopes(q,N); + min_test_scopes(q, N); #endif } @@ -309,7 +319,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + min_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { min_test_orders_scopes(q, N); diff --git a/SYCL/AtomicRef/or.cpp b/SYCL/AtomicRef/or.cpp index d6dea3e36d..651c4d8d8a 100644 --- a/SYCL/AtomicRef/or.cpp +++ b/SYCL/AtomicRef/or.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -242,21 +244,25 @@ void or_test(queue q) { template void or_test_scopes(queue q) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } or_test(q); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } or_test(q); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } @@ -267,21 +273,25 @@ void or_test_scopes(queue q) { } template void or_test_orders_scopes(queue q) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } or_test_scopes(q); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } or_test_scopes(q); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), 
memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } @@ -300,7 +310,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + if constexpr (sizeof(long) == 8) { or_test_orders_scopes(q); or_test_orders_scopes(q); diff --git a/SYCL/AtomicRef/xor.cpp b/SYCL/AtomicRef/xor.cpp index d600ca10d9..da78d49a06 100644 --- a/SYCL/AtomicRef/xor.cpp +++ b/SYCL/AtomicRef/xor.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -242,21 +244,25 @@ void xor_test(queue q) { template void xor_test_scopes(queue q) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } xor_test(q); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } xor_test(q); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } @@ -267,21 +273,25 @@ void xor_test_scopes(queue q) { } template void xor_test_orders_scopes(queue q) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } xor_test_scopes(q); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } xor_test_scopes(q); #elif defined(RELEASE) - if(std::find(orders.begin(), 
orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } @@ -300,7 +310,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + if constexpr (sizeof(long) == 8) { xor_test_orders_scopes(q); xor_test_orders_scopes(q); From c215e68b4000b7cc9ae27c8853a4a51c2a54dcaa Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Fri, 19 Nov 2021 13:39:27 +0100 Subject: [PATCH 05/27] [SYCL] add testing for both AtomicRef implementations --- SYCL/AtomicRef/add.cpp | 40 ++++++++++++++++++----------- SYCL/AtomicRef/and.cpp | 16 +++++++----- SYCL/AtomicRef/compare_exchange.cpp | 16 +++++++----- SYCL/AtomicRef/exchange.cpp | 16 +++++++----- SYCL/AtomicRef/max.cpp | 16 +++++++----- SYCL/AtomicRef/min.cpp | 16 +++++++----- SYCL/AtomicRef/or.cpp | 16 +++++++----- SYCL/AtomicRef/xor.cpp | 16 +++++++----- 8 files changed, 95 insertions(+), 57 deletions(-) diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index af0db82b8c..b6fd9e3e7c 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -157,7 +157,8 @@ using namespace sycl; -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_fetch_local_test(queue q, size_t N) { @@ -179,7 +180,7 @@ void add_fetch_local_test(queue q, size_t N) { if (gid == 0) loc[0] = 0; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -205,7 +206,8 @@ void add_fetch_local_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_fetch_test(queue q, size_t N) { @@ -221,7 +223,7 @@ void add_fetch_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -244,7 +246,8 @@ void add_fetch_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_plus_equal_test(queue q, size_t N) { @@ -261,7 +264,7 @@ void add_plus_equal_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -284,7 +287,8 @@ void add_plus_equal_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_pre_inc_test(queue q, size_t N) { @@ -301,7 +305,7 @@ void add_pre_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -324,7 +328,8 @@ void add_pre_inc_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_post_inc_test(queue q, size_t N) { @@ -341,7 +346,7 @@ void add_post_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -368,12 +373,17 @@ template void add_test(queue q, size_t N) { - add_fetch_local_test(q, N); - add_fetch_test(q, N); - add_plus_equal_test(q, N); + add_fetch_local_test<::sycl::ext::oneapi::atomic_ref, T, Difference, order, scope>(q, N); + add_fetch_local_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); + add_fetch_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_fetch_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); + add_plus_equal_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_plus_equal_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); if constexpr (!std::is_floating_point_v) { - add_pre_inc_test(q, N); - add_post_inc_test(q, N); + add_pre_inc_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_pre_inc_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); + add_post_inc_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_post_inc_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); } } diff --git a/SYCL/AtomicRef/and.cpp b/SYCL/AtomicRef/and.cpp index 89355aa9e4..2960d862b3 100644 --- a/SYCL/AtomicRef/and.cpp +++ b/SYCL/AtomicRef/and.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class 
AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void and_local_test(queue q) { const size_t N = 32; @@ -176,7 +177,7 @@ void and_local_test(queue q) { if (gid == 0) loc[0] = T((1ll << N) - 1); it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -197,7 +198,8 @@ void and_local_test(queue q) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void and_global_test(queue q) { const size_t N = 32; @@ -215,7 +217,7 @@ void and_global_test(queue q) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -236,8 +238,10 @@ void and_global_test(queue q) { template void and_test(queue q) { - and_local_test(q); - and_global_test(q); + and_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + and_local_test<::sycl::atomic_ref, T, order, scope>(q); + and_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + and_global_test<::sycl::atomic_ref, T, order, scope>(q); } template diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp index 3df693f173..2b04c693f3 100644 --- a/SYCL/AtomicRef/compare_exchange.cpp +++ b/SYCL/AtomicRef/compare_exchange.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void compare_exchange_local_test(queue q, size_t N) { const T initial = T(N); @@ -178,7 +179,7 @@ void compare_exchange_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -206,7 +207,8 @@ void compare_exchange_local_test(queue q, size_t N) { } } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void compare_exchange_global_test(queue q, size_t N) { const T initial = T(N); @@ -225,7 +227,7 @@ void compare_exchange_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -253,8 +255,10 @@ void compare_exchange_global_test(queue q, size_t N) { template void compare_exchange_test(queue q, size_t N) { - compare_exchange_local_test(q, N); - compare_exchange_global_test(q, N); + compare_exchange_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + compare_exchange_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + compare_exchange_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + compare_exchange_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp index 805c3ede18..1a3a074d65 100644 --- a/SYCL/AtomicRef/exchange.cpp +++ b/SYCL/AtomicRef/exchange.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void exchange_local_test(queue q, size_t N) { const T initial = T(N); @@ -176,7 +177,7 @@ void exchange_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -198,7 +199,8 @@ void exchange_local_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void exchange_global_test(queue q, size_t N) { const T initial = T(N); @@ -216,7 +218,7 @@ void exchange_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -238,8 +240,10 @@ void exchange_global_test(queue q, size_t N) { template void exchange_test(queue q, size_t N) { - exchange_local_test(q, N); - exchange_global_test(q, N); + exchange_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + exchange_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + exchange_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + exchange_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp index 81f1947fc9..f6522a3a0f 100644 --- a/SYCL/AtomicRef/max.cpp +++ b/SYCL/AtomicRef/max.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void max_local_test(queue q, size_t N) { T initial = std::numeric_limits::lowest(); @@ -176,7 +177,7 @@ void max_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -202,7 +203,8 @@ void max_local_test(queue q, size_t N) { } } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void max_global_test(queue q, size_t N) { T initial = std::numeric_limits::lowest(); @@ -219,7 +221,7 @@ void max_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -247,8 +249,10 @@ void max_global_test(queue q, size_t N) { template void max_test(queue q, size_t N) { - max_local_test(q, N); - max_global_test(q, N); + max_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + max_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + max_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + max_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp index 9678867cfa..18eb8dd449 100644 --- a/SYCL/AtomicRef/min.cpp +++ b/SYCL/AtomicRef/min.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void min_local_test(queue q, size_t N) { T initial = std::numeric_limits::max(); @@ -176,7 +177,7 @@ void min_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -202,7 +203,8 @@ void min_local_test(queue q, size_t N) { } } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void min_global_test(queue q, size_t N) { T initial = std::numeric_limits::max(); @@ -219,7 +221,7 @@ void min_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -245,8 +247,10 @@ void min_global_test(queue q, size_t N) { template void min_test(queue q, size_t N) { - min_local_test(q, N); - min_global_test(q, N); + min_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + min_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + min_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + min_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/or.cpp b/SYCL/AtomicRef/or.cpp index d6dea3e36d..834f3eb2d7 100644 --- a/SYCL/AtomicRef/or.cpp +++ b/SYCL/AtomicRef/or.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void or_local_test(queue q) { const size_t N = 32; @@ -176,7 +177,7 @@ void or_local_test(queue q) { if (gid == 0) loc[0] = 0; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -197,7 +198,8 @@ void or_local_test(queue q) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void or_global_test(queue q) { const size_t N = 32; @@ -215,7 +217,7 @@ void or_global_test(queue q) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -236,8 +238,10 @@ void or_global_test(queue q) { template void or_test(queue q) { - or_local_test(q); - or_global_test(q); + or_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + or_local_test<::sycl::atomic_ref, T, order, scope>(q); + or_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + or_global_test<::sycl::atomic_ref, T, order, scope>(q); } template diff --git a/SYCL/AtomicRef/xor.cpp b/SYCL/AtomicRef/xor.cpp index d600ca10d9..ae577f290e 100644 --- a/SYCL/AtomicRef/xor.cpp +++ b/SYCL/AtomicRef/xor.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void xor_local_test(queue q) { const size_t N = 32; @@ -176,7 +177,7 @@ void xor_local_test(queue q) { if (gid == 0) loc[0] = 0; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -197,7 +198,8 @@ void xor_local_test(queue q) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void xor_global_test(queue q) { const size_t N = 32; @@ -215,7 +217,7 @@ void xor_global_test(queue q) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -236,8 +238,10 @@ void xor_global_test(queue q) { template void xor_test(queue q) { - xor_local_test(q); - xor_global_test(q); + xor_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + xor_local_test<::sycl::atomic_ref, T, order, scope>(q); + xor_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + xor_global_test<::sycl::atomic_ref, T, order, scope>(q); } template From d18ca3411cd774f77575d1ec365eeb9bfb1a950e Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Fri, 19 Nov 2021 13:39:55 +0100 Subject: [PATCH 06/27] [SYCL] format --- SYCL/AtomicRef/add.cpp | 30 +++++++++++++++++++---------- SYCL/AtomicRef/and.cpp | 6 ++++-- SYCL/AtomicRef/compare_exchange.cpp | 12 ++++++++---- SYCL/AtomicRef/exchange.cpp | 6 ++++-- SYCL/AtomicRef/max.cpp | 6 ++++-- SYCL/AtomicRef/min.cpp | 6 ++++-- SYCL/AtomicRef/or.cpp | 6 ++++-- SYCL/AtomicRef/xor.cpp | 6 ++++-- 8 files changed, 52 insertions(+), 26 deletions(-) diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index b6ede2eba5..2bacd90a9e 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -160,7 +160,8 @@ using namespace sycl; template