From 626033377be3028cb864b62d2fdb3e248e9d1759 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Tue, 26 Oct 2021 15:18:55 +0200 Subject: [PATCH 01/27] [SYCL] Added tests for atomics with various memory orders and scopes --- SYCL/AtomicRef/add.cpp | 6 - SYCL/AtomicRef/add.h | 127 ++++++++++++++---- SYCL/AtomicRef/add_atomic64.cpp | 6 - SYCL/AtomicRef/add_orders_scopes.cpp | 49 +++++++ SYCL/AtomicRef/and.h | 96 +++++++++++++ SYCL/AtomicRef/and_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/compare_exchange.h | 104 +++++++++++--- .../compare_exchange_orders_scopes.cpp | 46 +++++++ SYCL/AtomicRef/exchange.h | 66 ++++++++- SYCL/AtomicRef/exchange_orders_scopes.cpp | 48 +++++++ SYCL/AtomicRef/max.h | 74 +++++++++- SYCL/AtomicRef/max_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/min.h | 68 +++++++++- SYCL/AtomicRef/min_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/or.h | 89 ++++++++++++ SYCL/AtomicRef/or_orders_scopes.cpp | 42 ++++++ SYCL/AtomicRef/xor.h | 96 +++++++++++++ SYCL/AtomicRef/xor_orders_scopes.cpp | 42 ++++++ SYCL/Reduction/reduction_range_1d_s0_dw.cpp | 2 +- SYCL/Reduction/reduction_range_1d_s0_rw.cpp | 2 +- SYCL/Reduction/reduction_range_1d_s1_dw.cpp | 2 +- SYCL/Reduction/reduction_range_1d_s1_rw.cpp | 2 +- SYCL/Reduction/reduction_range_2d_s1_dw.cpp | 2 +- SYCL/Reduction/reduction_range_2d_s1_rw.cpp | 2 +- SYCL/Reduction/reduction_range_3d_s1_dw.cpp | 2 +- SYCL/Reduction/reduction_range_3d_s1_rw.cpp | 2 +- SYCL/Reduction/reduction_range_usm_dw.cpp | 2 +- 27 files changed, 1016 insertions(+), 87 deletions(-) create mode 100644 SYCL/AtomicRef/add_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/and.h create mode 100644 SYCL/AtomicRef/and_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/compare_exchange_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/exchange_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/max_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/min_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/or.h create mode 100644 
SYCL/AtomicRef/or_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/xor.h create mode 100644 SYCL/AtomicRef/xor_orders_scopes.cpp diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index e2687fc841..64fc4fcd79 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -9,12 +9,6 @@ #include using namespace sycl; -// Floating-point types do not support pre- or post-increment -template <> void add_test(queue q, size_t N) { - add_fetch_test(q, N); - add_plus_equal_test(q, N); -} - int main() { queue q; diff --git a/SYCL/AtomicRef/add.h b/SYCL/AtomicRef/add.h index 99f4780ce9..5a1bfd06d2 100644 --- a/SYCL/AtomicRef/add.h +++ b/SYCL/AtomicRef/add.h @@ -4,12 +4,63 @@ #include #include #include +#include #include using namespace sycl; using namespace sycl::ext::oneapi; -template +template +void add_fetch_local_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_add(Difference(1), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + sum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template void add_fetch_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -17,18 +68,20 @@ void add_fetch_test(queue q, size_t N) { { buffer sum_buf(&sum, 1); buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); - out[gid] = atm.fetch_add(Difference(1)); - }); - }); + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm.fetch_add(Difference(1), order); + }); + }).wait_and_throw(); } // All work-items increment by 1, so final value should be equal to N @@ -37,14 +90,16 @@ void add_fetch_test(queue q, size_t N) { // Fetch returns original value: will be in [0, N-1] auto min_e = std::min_element(output.begin(), output.end()); auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(0) && *max_e == T(N - 1)); + assert(*min_e == 0 && *max_e == T(N - 1)); // Intermediate values should be unique std::sort(output.begin(), output.end()); assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_plus_equal_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -59,8 +114,11 @@ void add_plus_equal_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); out[gid] = atm += Difference(1); }); }); @@ -79,7 +137,9 @@ void add_plus_equal_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_pre_inc_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -94,8 +154,11 @@ void add_pre_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); out[gid] = ++atm; }); }); @@ -114,7 +177,9 @@ void add_pre_inc_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_post_inc_test(queue q, size_t N) { T sum = 0; std::vector output(N); @@ -129,8 +194,11 @@ void add_post_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(sum[0]); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); out[gid] = atm++; }); }); @@ -149,10 +217,15 @@ void add_post_inc_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template +template void add_test(queue q, size_t N) { - add_fetch_test(q, N); - add_plus_equal_test(q, N); - add_pre_inc_test(q, N); - add_post_inc_test(q, N); + add_fetch_local_test(q, N); + add_fetch_test(q, N); + add_plus_equal_test(q, N); + if constexpr (!std::is_floating_point_v) { + add_pre_inc_test(q, N); + add_post_inc_test(q, N); + } } diff --git a/SYCL/AtomicRef/add_atomic64.cpp b/SYCL/AtomicRef/add_atomic64.cpp index 6059a7cd7f..13d47a406a 100644 --- a/SYCL/AtomicRef/add_atomic64.cpp +++ b/SYCL/AtomicRef/add_atomic64.cpp @@ -9,12 +9,6 @@ #include using namespace sycl; -// Floating-point types do not support pre- or post-increment -template <> void add_test(queue q, size_t N) { - add_fetch_test(q, N); - add_plus_equal_test(q, N); -} - int main() { queue q; diff --git a/SYCL/AtomicRef/add_orders_scopes.cpp b/SYCL/AtomicRef/add_orders_scopes.cpp new file mode 100644 index 0000000000..6be7c66186 --- /dev/null +++ b/SYCL/AtomicRef/add_orders_scopes.cpp @@ -0,0 +1,49 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ +// RUN: 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#define SYCL_USE_NATIVE_FP_ATOMICS + +#include "add.h" +#include +using namespace sycl; + +template +void add_test_scopes(queue q, size_t N) { + add_test(q, N); + add_test(q, N); + add_test(q, N); + add_test(q, N); +} + +template +void add_test_orders_scopes(queue q, size_t N) { + add_test_scopes(q, N); + add_test_scopes(q, N); + add_test_scopes(q, N); + add_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/and.h b/SYCL/AtomicRef/and.h new file mode 100644 index 0000000000..3d5e7db2cf --- /dev/null +++ b/SYCL/AtomicRef/and.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +template +void and_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = T((1ll << N) - 1); + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_global_test(queue q) { + const size_t N = 32; + const T initial = T((1ll << N) - 1); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + 
auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_test(queue q) { + and_local_test(q); + and_global_test(q); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/and_orders_scopes.cpp b/SYCL/AtomicRef/and_orders_scopes.cpp new file mode 100644 index 0000000000..1b020e6176 --- /dev/null +++ b/SYCL/AtomicRef/and_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "and.h" +#include +using namespace sycl; + +template +void and_test_scopes(queue q) { + and_test(q); + and_test(q); + and_test(q); + and_test(q); +} + +template void and_test_orders_scopes(queue q) { + and_test_scopes(q); + and_test_scopes(q); + and_test_scopes(q); + and_test_scopes(q); +} + +int main() { + queue q; + + constexpr int N = 32; + and_test_orders_scopes(q); + and_test_orders_scopes(q); + and_test_orders_scopes(q); + and_test_orders_scopes(q); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + and_test_orders_scopes(q); + and_test_orders_scopes(q); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/compare_exchange.h b/SYCL/AtomicRef/compare_exchange.h index 04da52b81f..72107c8b18 100644 --- a/SYCL/AtomicRef/compare_exchange.h +++ b/SYCL/AtomicRef/compare_exchange.h @@ -9,9 +9,61 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template class compare_exchange_kernel; +template +void compare_exchange_local_test(queue q, size_t N) { + const T initial = T(N); + T compare_exchange = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto compare_exchange = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + it.barrier(access::fence_space::local_space); + if (gid == 0) + compare_exchange[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); -template void compare_exchange_test(queue q, size_t N) { + // All other values should be the index itself or the sentinel value + for (size_t i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +template +void compare_exchange_global_test(queue q, size_t N) { const T initial = T(N); T compare_exchange = initial; std::vector output(N); @@ -21,26 +73,27 @@ template void compare_exchange_test(queue q, size_t N) { buffer output_buf(output.data(), output.size()); q.submit([&](handler &cgh) { - auto exc = - compare_exchange_buf.template get_access( - cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for>( - range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = - atomic_ref(exc[0]); - T result = T(N); // Avoid copying pointer - bool success = atm.compare_exchange_strong(result, (T)gid); - if (success) { - out[gid] = result; - } else { - out[gid] = T(gid); - } - }); - }); + auto exc = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + }); + }).wait_and_throw(); } // Only one work-item should have received the initial sentinel value @@ -51,3 +104,10 @@ template void compare_exchange_test(queue q, size_t N) { assert(output[i] == T(i) || output[i] == initial); } } + +template +void compare_exchange_test(queue q, size_t N) { + compare_exchange_local_test(q, N); + compare_exchange_global_test(q, N); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp b/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp new file mode 100644 index 0000000000..15c36c6e7c --- /dev/null +++ b/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp @@ -0,0 +1,46 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "compare_exchange.h" +#include +using namespace sycl; + +template +void compare_exchange_test_scopes(queue q, size_t N) { + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); + compare_exchange_test(q, N); +} + +template +void compare_exchange_test_orders_scopes(queue q, size_t N) { + compare_exchange_test_scopes(q, N); + compare_exchange_test_scopes(q, N); + compare_exchange_test_scopes(q, N); + compare_exchange_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, 
N); + compare_exchange_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + } + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/exchange.h b/SYCL/AtomicRef/exchange.h index a050ddaf4d..b8a2c7f81f 100644 --- a/SYCL/AtomicRef/exchange.h +++ b/SYCL/AtomicRef/exchange.h @@ -9,9 +9,53 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template class exchange_kernel; +template +void exchange_local_test(queue q, size_t N) { + const T initial = T(N); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); -template void exchange_test(queue q, size_t N) { + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.exchange(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it + // reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void exchange_global_test(queue q, size_t N) { const T initial = T(N); T exchange = initial; std::vector output(N); @@ -25,11 +69,14 @@ template void exchange_test(queue q, size_t N) { exchange_buf.template get_access(cgh); auto out = output_buf.template get_access(cgh); - cgh.parallel_for>(range<1>(N), [=](item<1> it) { + cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref(exc[0]); - out[gid] = atm.exchange(T(gid)); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + out[gid] = atm.exchange(T(gid), order); }); }); } @@ -42,3 +89,10 @@ template void exchange_test(queue q, size_t N) { std::sort(output.begin(), output.end()); assert(std::unique(output.begin(), output.end()) == output.end()); } + +template +void exchange_test(queue q, size_t N) { + exchange_local_test(q, N); + exchange_global_test(q, N); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/exchange_orders_scopes.cpp b/SYCL/AtomicRef/exchange_orders_scopes.cpp new file mode 100644 index 0000000000..d3a9ee2c1d --- /dev/null +++ b/SYCL/AtomicRef/exchange_orders_scopes.cpp @@ -0,0 +1,48 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "exchange.h" +#include +using namespace sycl; + +template +void exchange_test_scopes(queue q, size_t N) { + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); + exchange_test(q, N); +} + +template void exchange_test_orders_scopes(queue q, size_t N) { + exchange_test_scopes(q, N); + exchange_test_scopes(q, N); + exchange_test_scopes(q, N); + exchange_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + + exchange_test_orders_scopes(q, N); + + exchange_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/max.h b/SYCL/AtomicRef/max.h index 4da562e8e4..d284a60096 100644 --- a/SYCL/AtomicRef/max.h +++ b/SYCL/AtomicRef/max.h @@ -9,7 +9,57 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template void max_test(queue q, size_t N) { +template +void max_local_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + assert(cum == N - 1 + std::numeric_limits::max() / 2); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +template +void max_global_test(queue q, size_t N) { T initial = std::numeric_limits::lowest(); T val = initial; std::vector output(N); @@ -24,17 +74,20 @@ template void max_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(val[0]); + auto atm = 
atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); - // +1 accounts for lowest() returning 0 for unsigned types - out[gid] = atm.fetch_max(T(gid) + 1); + // +max/2 to ensure correct signed/unsigned operation is applied + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); }); }); } - // Final value should be equal to N - assert(val == N); + assert(val == N - 1 + std::numeric_limits::max() / 2); // Only one work-item should have received the initial value assert(std::count(output.begin(), output.end(), initial) == 1); @@ -45,3 +98,10 @@ template void max_test(queue q, size_t N) { assert(output[i] >= initial); } } + +template +void max_test(queue q, size_t N) { + max_local_test(q, N); + max_global_test(q, N); +} diff --git a/SYCL/AtomicRef/max_orders_scopes.cpp b/SYCL/AtomicRef/max_orders_scopes.cpp new file mode 100644 index 0000000000..0320be5969 --- /dev/null +++ b/SYCL/AtomicRef/max_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "max.h" +#include +using namespace sycl; + +template +void max_test_scopes(queue q, size_t N) { + max_test(q, N); + max_test(q, N); + max_test(q, N); + max_test(q, N); +} + +template void max_test_orders_scopes(queue q, size_t N) { + max_test_scopes(q, N); + max_test_scopes(q, N); + max_test_scopes(q, N); + max_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + 
max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + } + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/min.h b/SYCL/AtomicRef/min.h index a493cd8840..ac5d32bebf 100644 --- a/SYCL/AtomicRef/min.h +++ b/SYCL/AtomicRef/min.h @@ -9,7 +9,57 @@ using namespace sycl; using namespace sycl::ext::oneapi; -template void min_test(queue q, size_t N) { +template +void min_local_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_min(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +template +void min_global_test(queue q, size_t N) { T initial = std::numeric_limits::max(); T val = initial; std::vector output(N); @@ -24,9 +74,12 @@ template void min_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref(val[0]); - out[gid] = atm.fetch_min(T(gid)); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); + out[gid] = atm.fetch_min(T(gid), order); }); }); } @@ -43,3 +96,10 @@ template void min_test(queue q, size_t N) { assert(output[i] <= initial); } } + +template +void min_test(queue q, size_t N) { + min_local_test(q, N); + min_global_test(q, N); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/min_orders_scopes.cpp b/SYCL/AtomicRef/min_orders_scopes.cpp new file mode 100644 index 0000000000..fdf7c620a5 --- /dev/null +++ b/SYCL/AtomicRef/min_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "min.h" +#include +using namespace sycl; + +template +void min_test_scopes(queue q, size_t N) { + min_test(q, N); + min_test(q, N); + min_test(q, N); + min_test(q, N); +} + +template void min_test_orders_scopes(queue q, size_t N) { + min_test_scopes(q, N); + min_test_scopes(q, N); + min_test_scopes(q, N); + min_test_scopes(q, N); +} + +int main() { + queue q; + + constexpr int N = 32; + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/or.h b/SYCL/AtomicRef/or.h new file mode 100644 index 0000000000..fa28b2f257 --- /dev/null +++ b/SYCL/AtomicRef/or.h @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +template +void or_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void or_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, 
+ (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} diff --git a/SYCL/AtomicRef/or_orders_scopes.cpp b/SYCL/AtomicRef/or_orders_scopes.cpp new file mode 100644 index 0000000000..2adad06a45 --- /dev/null +++ b/SYCL/AtomicRef/or_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "or.h" +#include +using namespace sycl; + +template +void or_test_scopes(queue q) { + or_test(q); + or_test(q); + or_test(q); + or_test(q); +} + +template void or_test_orders_scopes(queue q) { + or_test_scopes(q); + or_test_scopes(q); + or_test_scopes(q); + or_test_scopes(q); +} + +int main() { + queue q; + + constexpr int N = 32; + or_test_orders_scopes(q); + or_test_orders_scopes(q); + or_test_orders_scopes(q); + or_test_orders_scopes(q); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/xor.h b/SYCL/AtomicRef/xor.h new file mode 100644 index 0000000000..d928d81453 --- /dev/null +++ b/SYCL/AtomicRef/xor.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +template +void xor_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_global_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = 
atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_test(queue q) { + xor_local_test(q); + xor_global_test(q); +} \ No newline at end of file diff --git a/SYCL/AtomicRef/xor_orders_scopes.cpp b/SYCL/AtomicRef/xor_orders_scopes.cpp new file mode 100644 index 0000000000..3bd70c0550 --- /dev/null +++ b/SYCL/AtomicRef/xor_orders_scopes.cpp @@ -0,0 +1,42 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "xor.h" +#include +using namespace sycl; + +template +void xor_test_scopes(queue q) { + xor_test(q); + xor_test(q); + xor_test(q); + xor_test(q); +} + +template void xor_test_orders_scopes(queue q) { + xor_test_scopes(q); + xor_test_scopes(q); + xor_test_scopes(q); + xor_test_scopes(q); +} + +int main() { + queue q; + + constexpr int N = 32; + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp index d62c30e8b3..0725df16fb 100644 --- a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp index 74317e6d41..a8ff39fcdb 100644 --- a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp index 8bfffbc3e3..fafa75755b 100644 --- a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp index 359aa2f0fe..63dcd53ed3 100644 --- a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // 
RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp index 22b2d32103..4b85b529b3 100644 --- a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // diff --git a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp index b2fb2ba14d..057939f733 100644 --- a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp index c4b7a4ab6e..fdd26d3e91 100644 --- a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp index 79bc4eed55..048f6075f7 100644 --- a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o 
%t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Reduction/reduction_range_usm_dw.cpp b/SYCL/Reduction/reduction_range_usm_dw.cpp index e50626464e..950142f74e 100644 --- a/SYCL/Reduction/reduction_range_usm_dw.cpp +++ b/SYCL/Reduction/reduction_range_usm_dw.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out From a900c8fc40293a3431da938c3b9bdba5924b2cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 18 Nov 2021 15:01:20 +0000 Subject: [PATCH 02/27] [SYCL] merged add tests into one file --- SYCL/AtomicRef/add.cpp | 459 ++++++++++++++++++++++++++- SYCL/AtomicRef/add.h | 231 -------------- SYCL/AtomicRef/add_atomic64.cpp | 41 --- SYCL/AtomicRef/add_orders_scopes.cpp | 49 --- 4 files changed, 445 insertions(+), 335 deletions(-) delete mode 100644 SYCL/AtomicRef/add.h delete mode 100644 SYCL/AtomicRef/add_atomic64.cpp delete mode 100644 SYCL/AtomicRef/add_orders_scopes.cpp diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index 64fc4fcd79..57c2fe488d 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -1,32 +1,463 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ -// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "add.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend 
--cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx 
-fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: 
%HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx 
-fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#define SYCL_USE_NATIVE_FP_ATOMICS + +#include +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void add_fetch_local_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_add(Difference(1), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + sum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_fetch_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm.fetch_add(Difference(1), order); + }); + }).wait_and_throw(); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Fetch returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == 0 && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_plus_equal_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm += Difference(1); + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // += returns updated value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == T(1) && *max_e == T(N)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_pre_inc_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = ++atm; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Pre-increment returns updated value: will be in [1, N] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == T(1) && *max_e == T(N)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_post_inc_test(queue q, size_t N) { + T sum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer sum_buf(&sum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (sum[0]); + out[gid] = atm++; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + assert(sum == T(N)); + + // Post-increment returns original value: will be in [0, N-1] + auto min_e = std::min_element(output.begin(), output.end()); + auto max_e = std::max_element(output.begin(), output.end()); + assert(*min_e == T(0) && *max_e == T(N - 1)); + + // Intermediate values should be unique + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void add_test(queue q, size_t N) { + add_fetch_local_test(q, N); + add_fetch_test(q, N); + add_plus_equal_test(q, N); + if constexpr (!std::is_floating_point_v) { + add_pre_inc_test(q, N); + add_post_inc_test(q, N); + } +} + +template +void add_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test(q,N); +#else + add_test(q,N); +#endif +} + +template +void + add_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test_scopes(q,N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test_scopes(q,N); 
+#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + add_test_scopes(q,N); +#else + add_test_scopes(q,N); +#endif +} + int main() { queue q; constexpr int N = 32; - add_test(q, N); - add_test(q, N); - add_test(q, N); - - // Include long tests if they are 32 bits wide - if constexpr (sizeof(long) == 4) { - add_test(q, N); - add_test(q, N); +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + if constexpr (sizeof(long) == 8) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); } - // Include pointer tests if they are 32 bits wide - if constexpr (sizeof(char *) == 4) { - add_test(q, N); + // Include long long tests if they are 64 bits wide + if constexpr (sizeof(long long) == 8) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); } +#else + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 4) { + add_test_orders_scopes(q, N); + add_test_orders_scopes(q, N); + } +#endif + std::cout << "Test passed." 
<< std::endl; } + diff --git a/SYCL/AtomicRef/add.h b/SYCL/AtomicRef/add.h deleted file mode 100644 index 5a1bfd06d2..0000000000 --- a/SYCL/AtomicRef/add.h +++ /dev/null @@ -1,231 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void add_fetch_local_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = 0; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_add(Difference(1), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - sum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Fetch returns original value: will be in [0, N-1] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == 0 && *max_e == T(N - 1)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_fetch_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto sum = 
sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = atm.fetch_add(Difference(1), order); - }); - }).wait_and_throw(); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Fetch returns original value: will be in [0, N-1] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == 0 && *max_e == T(N - 1)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_plus_equal_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = atm += Difference(1); - }); - }); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // += returns updated value: will be in [1, N] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(1) && *max_e == T(N)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_pre_inc_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = ++atm; - }); - }); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Pre-increment returns updated value: will be in [1, N] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(1) && *max_e == T(N)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_post_inc_test(queue q, size_t N) { - T sum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer sum_buf(&sum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto sum = sum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (sum[0]); - out[gid] = atm++; - }); - }); - } - - // All work-items increment by 1, so final value should be equal to N - assert(sum == T(N)); - - // Post-increment returns original value: will be in [0, N-1] - auto min_e = std::min_element(output.begin(), output.end()); - auto max_e = std::max_element(output.begin(), output.end()); - assert(*min_e == T(0) && *max_e == T(N - 1)); - - // Intermediate values should be unique - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void add_test(queue q, size_t N) { - add_fetch_local_test(q, N); - add_fetch_test(q, N); - add_plus_equal_test(q, N); - if constexpr (!std::is_floating_point_v) { - add_pre_inc_test(q, N); - add_post_inc_test(q, N); - } -} diff --git a/SYCL/AtomicRef/add_atomic64.cpp b/SYCL/AtomicRef/add_atomic64.cpp deleted file mode 100644 index 13d47a406a..0000000000 --- a/SYCL/AtomicRef/add_atomic64.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ -// RUN: -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "add.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - add_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - add_test(q, N); - add_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - add_test(q, N); - add_test(q, N); - } - - // Include pointer tests if they are 64 bits wide - if constexpr (sizeof(char *) == 8) { - add_test(q, N); - } - - std::cout << "Test 
passed." << std::endl; -} diff --git a/SYCL/AtomicRef/add_orders_scopes.cpp b/SYCL/AtomicRef/add_orders_scopes.cpp deleted file mode 100644 index 6be7c66186..0000000000 --- a/SYCL/AtomicRef/add_orders_scopes.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \ -// RUN: -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#define SYCL_USE_NATIVE_FP_ATOMICS - -#include "add.h" -#include -using namespace sycl; - -template -void add_test_scopes(queue q, size_t N) { - add_test(q, N); - add_test(q, N); - add_test(q, N); - add_test(q, N); -} - -template -void add_test_orders_scopes(queue q, size_t N) { - add_test_scopes(q, N); - add_test_scopes(q, N); - add_test_scopes(q, N); - add_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} From d7f7e345ac8291fe1f8f2519101dc1e594853ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 19 Nov 2021 11:29:04 +0000 Subject: [PATCH 03/27] [SYCL] merged tests for other operations --- SYCL/AtomicRef/add.cpp | 15 +- SYCL/AtomicRef/and.cpp | 321 +++++++++++++++++ SYCL/AtomicRef/and.h | 96 ----- SYCL/AtomicRef/and_orders_scopes.cpp | 42 --- SYCL/AtomicRef/compare_exchange.cpp | 339 +++++++++++++++++- SYCL/AtomicRef/compare_exchange.h | 113 ------ SYCL/AtomicRef/compare_exchange_atomic64.cpp | 40 --- .../compare_exchange_orders_scopes.cpp | 46 --- SYCL/AtomicRef/exchange.cpp | 323 ++++++++++++++++- SYCL/AtomicRef/exchange.h | 98 ----- SYCL/AtomicRef/exchange_atomic64.cpp | 40 --- SYCL/AtomicRef/exchange_orders_scopes.cpp | 48 --- SYCL/AtomicRef/max.cpp | 327 ++++++++++++++++- SYCL/AtomicRef/max.h | 107 ------ SYCL/AtomicRef/max_atomic64.cpp | 35 -- SYCL/AtomicRef/max_orders_scopes.cpp | 42 --- SYCL/AtomicRef/min.cpp | 325 ++++++++++++++++- SYCL/AtomicRef/min.h | 105 ------ SYCL/AtomicRef/min_atomic64.cpp | 35 -- SYCL/AtomicRef/min_orders_scopes.cpp | 42 --- SYCL/AtomicRef/or.cpp | 322 +++++++++++++++++ SYCL/AtomicRef/or.h | 89 ----- SYCL/AtomicRef/or_orders_scopes.cpp | 42 --- SYCL/AtomicRef/xor.cpp | 322 +++++++++++++++++ SYCL/AtomicRef/xor.h | 96 ----- SYCL/AtomicRef/xor_orders_scopes.cpp | 42 --- 26 files changed, 2250 insertions(+), 1202 deletions(-) create mode 100644 SYCL/AtomicRef/and.cpp delete mode 100644 SYCL/AtomicRef/and.h delete mode 100644 SYCL/AtomicRef/and_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/compare_exchange.h delete mode 100644 SYCL/AtomicRef/compare_exchange_atomic64.cpp delete mode 100644 SYCL/AtomicRef/compare_exchange_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/exchange.h delete mode 100644 SYCL/AtomicRef/exchange_atomic64.cpp delete mode 100644 SYCL/AtomicRef/exchange_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/max.h delete mode 100644 SYCL/AtomicRef/max_atomic64.cpp 
delete mode 100644 SYCL/AtomicRef/max_orders_scopes.cpp delete mode 100644 SYCL/AtomicRef/min.h delete mode 100644 SYCL/AtomicRef/min_atomic64.cpp delete mode 100644 SYCL/AtomicRef/min_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/or.cpp delete mode 100644 SYCL/AtomicRef/or.h delete mode 100644 SYCL/AtomicRef/or_orders_scopes.cpp create mode 100644 SYCL/AtomicRef/xor.cpp delete mode 100644 SYCL/AtomicRef/xor.h delete mode 100644 SYCL/AtomicRef/xor_orders_scopes.cpp diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index 57c2fe488d..af0db82b8c 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -1,3 +1,6 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -436,28 +439,32 @@ int main() { std::cout << "Skipping test\n"; return 0; } + + add_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); } - - // Include long long tests if they are 64 bits wide if constexpr (sizeof(long long) == 8) { add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); } + if constexpr (sizeof(char *) == 8) { + add_test(q, N); + } #else add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); - add_test_orders_scopes(q, N); if constexpr (sizeof(long) == 4) { add_test_orders_scopes(q, N); add_test_orders_scopes(q, N); } + if constexpr (sizeof(char *) == 4) { + add_test_orders_scopes(q, N); + } #endif - std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/and.cpp b/SYCL/AtomicRef/and.cpp new file mode 100644 index 0000000000..89355aa9e4 --- /dev/null +++ b/SYCL/AtomicRef/and.cpp @@ -0,0 +1,321 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda 
-fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: 
%GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda 
-fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void and_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + 
std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = T((1ll << N) - 1); + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_global_test(queue q) { + const size_t N = 32; + const T initial = T((1ll << N) - 1); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_and(~T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // All other values should be unique; each work-item sets one bit to 0 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void and_test(queue q) { + and_local_test(q); + and_global_test(q); +} + +template +void and_test_scopes(queue q) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test(q); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test(q); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test(q); +#else + and_test(q); +#endif +} + +template void and_test_orders_scopes(queue q) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test_scopes(q); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test_scopes(q); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + and_test_scopes(q); +#else + and_test_scopes(q); +#endif +} + +int main() { + queue q; + +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + if constexpr (sizeof(long) == 8) { +
and_test_orders_scopes(q); + and_test_orders_scopes(q); + } + if constexpr (sizeof(long long) == 8) { + and_test_orders_scopes(q); + and_test_orders_scopes(q); + } +#else + and_test_orders_scopes(q); + and_test_orders_scopes(q); + if constexpr (sizeof(long) == 4) { + and_test_orders_scopes(q); + and_test_orders_scopes(q); + } +#endif + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/and.h b/SYCL/AtomicRef/and.h deleted file mode 100644 index 3d5e7db2cf..0000000000 --- a/SYCL/AtomicRef/and.h +++ /dev/null @@ -1,96 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void and_local_test(queue q) { - const size_t N = 32; - T cum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = T((1ll << N) - 1); - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_and(~T(1ll << gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to 0 - assert(cum == 0); - - // All other values should be unique; each work-item sets one bit to 0 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void and_global_test(queue q) { - const size_t N = 32; - const T initial = T((1ll << N) - 1); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (cum[0]); - out[gid] = atm.fetch_and(~T(1ll << gid), order); - }); - }); - } - - // Final value should be equal to 0 - assert(cum == 0); - - // All other values should be unique; each work-item sets one bit to 0 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void and_test(queue q) { - and_local_test(q); - and_global_test(q); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/and_orders_scopes.cpp b/SYCL/AtomicRef/and_orders_scopes.cpp deleted file mode 100644 index 1b020e6176..0000000000 --- a/SYCL/AtomicRef/and_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "and.h" -#include -using namespace sycl; - -template -void and_test_scopes(queue q) { - and_test(q); - and_test(q); - and_test(q); - and_test(q); -} - -template void and_test_orders_scopes(queue q) { - and_test_scopes(q); - and_test_scopes(q); - and_test_scopes(q); - and_test_scopes(q); -} - -int main() { - queue q; - - constexpr int N = 32; - and_test_orders_scopes(q); - and_test_orders_scopes(q); - and_test_orders_scopes(q); - and_test_orders_scopes(q); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - and_test_orders_scopes(q); - and_test_orders_scopes(q); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp index 278f74e8c8..3df693f173 100644 --- a/SYCL/AtomicRef/compare_exchange.cpp +++ b/SYCL/AtomicRef/compare_exchange.cpp @@ -1,31 +1,348 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM // RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "compare_exchange.h" +#include +#include +#include +#include +#include #include + using 
namespace sycl; +template +void compare_exchange_local_test(queue q, size_t N) { + const T initial = T(N); + T compare_exchange = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto compare_exchange = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + it.barrier(access::fence_space::local_space); + if (gid == 0) + compare_exchange[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be the index itself or the sentinel value + for (size_t i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +template +void compare_exchange_global_test(queue q, size_t N) { + const T initial = T(N); + T compare_exchange = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer compare_exchange_buf(&compare_exchange, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto exc = + compare_exchange_buf.template get_access( + cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), 
[=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + T result = T(N); // Avoid copying pointer + bool success = atm.compare_exchange_strong(result, (T)gid, order); + if (success) { + out[gid] = result; + } else { + out[gid] = T(gid); + } + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be the index itself or the sentinel value + for (size_t i = 0; i < N; ++i) { + assert(output[i] == T(i) || output[i] == initial); + } +} + +template +void compare_exchange_test(queue q, size_t N) { + compare_exchange_local_test(q, N); + compare_exchange_global_test(q, N); +} + +template +void compare_exchange_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test(q,N); +#else + compare_exchange_test(q,N); +#endif +} + +template +void compare_exchange_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test_scopes(q,N); +#elif defined(ACQUIRE) + 
if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test_scopes(q,N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + compare_exchange_test_scopes(q,N); +#else + compare_exchange_test_scopes(q,N); +#endif +} + int main() { queue q; constexpr int N = 32; - compare_exchange_test(q, N); - compare_exchange_test(q, N); - compare_exchange_test(q, N); +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + compare_exchange_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(char *) == 8) { + compare_exchange_test_orders_scopes(q, N); + } +#else + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); - // Include long tests if they are 32 bits wide if constexpr (sizeof(long) == 4) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); + compare_exchange_test_orders_scopes(q, N); + compare_exchange_test_orders_scopes(q, N); } - - // Include pointer tests if they are 32 bits wide if constexpr (sizeof(char *) == 4) { - compare_exchange_test(q, N); + compare_exchange_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/compare_exchange.h b/SYCL/AtomicRef/compare_exchange.h deleted file mode 100644 index 72107c8b18..0000000000 --- a/SYCL/AtomicRef/compare_exchange.h +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void compare_exchange_local_test(queue q, size_t N) { - const T initial = T(N); - T compare_exchange = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer compare_exchange_buf(&compare_exchange, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto compare_exchange = - compare_exchange_buf.template get_access( - cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - T result = T(N); // Avoid copying pointer - bool success = atm.compare_exchange_strong(result, (T)gid, order); - if (success) { - out[gid] = result; - } else { - out[gid] = T(gid); - } - it.barrier(access::fence_space::local_space); - if (gid == 0) - compare_exchange[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be the index itself or the sentinel value - for (size_t i = 0; i < N; ++i) { - assert(output[i] == T(i) || output[i] == initial); - } -} - -template -void compare_exchange_global_test(queue q, size_t N) { - const T initial = T(N); - T compare_exchange = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer compare_exchange_buf(&compare_exchange, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto exc = - compare_exchange_buf.template get_access( - cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (exc[0]); - T result = T(N); // Avoid copying pointer - bool success = atm.compare_exchange_strong(result, (T)gid, order); - if (success) { - out[gid] = result; - } else { - out[gid] = T(gid); - } - }); - }).wait_and_throw(); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be the index itself or the sentinel value - for (size_t i = 0; i < N; ++i) { - assert(output[i] == T(i) || output[i] == initial); - } -} - -template -void compare_exchange_test(queue q, size_t N) { - compare_exchange_local_test(q, N); - compare_exchange_global_test(q, N); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/compare_exchange_atomic64.cpp b/SYCL/AtomicRef/compare_exchange_atomic64.cpp deleted file mode 100644 index 600b0920e4..0000000000 --- a/SYCL/AtomicRef/compare_exchange_atomic64.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "compare_exchange.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - compare_exchange_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); - } - - // Include pointer tests if they are 64 bits wide - if constexpr (sizeof(char *) == 8) { - compare_exchange_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp b/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp deleted file mode 100644 index 15c36c6e7c..0000000000 --- a/SYCL/AtomicRef/compare_exchange_orders_scopes.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "compare_exchange.h" -#include -using namespace sycl; - -template -void compare_exchange_test_scopes(queue q, size_t N) { - compare_exchange_test(q, N); - compare_exchange_test(q, N); - compare_exchange_test(q, N); - compare_exchange_test(q, N); -} - -template -void compare_exchange_test_orders_scopes(queue q, size_t N) { - compare_exchange_test_scopes(q, N); - compare_exchange_test_scopes(q, N); - compare_exchange_test_scopes(q, N); - compare_exchange_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - compare_exchange_test_orders_scopes(q, N); - compare_exchange_test_orders_scopes(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp index 0c250bbf11..805c3ede18 100644 --- a/SYCL/AtomicRef/exchange.cpp +++ b/SYCL/AtomicRef/exchange.cpp @@ -1,31 +1,332 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// 
RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 
-DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 // RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "exchange.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// 
RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void exchange_local_test(queue q, 
size_t N) { + const T initial = T(N); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.exchange(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it + // reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void exchange_global_test(queue q, size_t N) { + const T initial = T(N); + T exchange = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer exchange_buf(&exchange, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto exc = + exchange_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (exc[0]); + out[gid] = atm.exchange(T(gid), order); + }); + }); + } + + // Only one work-item should have received the initial sentinel value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // All other values should be unique; each work-item replaces the value it + // reads with its own ID + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void exchange_test(queue q, size_t N) { + exchange_local_test(q, N); + exchange_global_test(q, N); +} + +template +void exchange_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test(q,N); +#else + exchange_test(q,N); +#endif +} + +template void exchange_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test_scopes(q, N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test_scopes(q, N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + exchange_test_scopes(q, N); +#else + 
exchange_test_scopes(q, N); +#endif +} + int main() { queue q; constexpr int N = 32; - exchange_test(q, N); - exchange_test(q, N); - exchange_test(q, N); +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + exchange_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + } + if constexpr (sizeof(char *) == 8) { + exchange_test_orders_scopes(q, N); + } +#else + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); - // Include long tests if they are 32 bits wide if constexpr (sizeof(long) == 4) { - exchange_test(q, N); - exchange_test(q, N); + exchange_test_orders_scopes(q, N); + exchange_test_orders_scopes(q, N); } - - // Include pointer tests if they are 32 bits wide if constexpr (sizeof(char *) == 4) { - exchange_test(q, N); + exchange_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/exchange.h b/SYCL/AtomicRef/exchange.h deleted file mode 100644 index b8a2c7f81f..0000000000 --- a/SYCL/AtomicRef/exchange.h +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void exchange_local_test(queue q, size_t N) { - const T initial = T(N); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.exchange(T(gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be unique; each work-item replaces the value it - // reads with its own ID - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void exchange_global_test(queue q, size_t N) { - const T initial = T(N); - T exchange = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer exchange_buf(&exchange, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto exc = - exchange_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (exc[0]); - out[gid] = atm.exchange(T(gid), order); - }); - }); - } - - // Only one work-item should have received the initial sentinel value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // All other values should be unique; each work-item replaces the value it - // reads with its own ID - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void exchange_test(queue q, size_t N) { - exchange_local_test(q, N); - exchange_global_test(q, N); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/exchange_atomic64.cpp b/SYCL/AtomicRef/exchange_atomic64.cpp deleted file mode 100644 index 50d665a355..0000000000 --- a/SYCL/AtomicRef/exchange_atomic64.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "exchange.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - exchange_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - exchange_test(q, N); - exchange_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - exchange_test(q, N); - exchange_test(q, N); - } - - // Include pointer tests if they are 64 bits wide - if constexpr (sizeof(char *) == 8) { - exchange_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/exchange_orders_scopes.cpp b/SYCL/AtomicRef/exchange_orders_scopes.cpp deleted file mode 100644 index d3a9ee2c1d..0000000000 --- a/SYCL/AtomicRef/exchange_orders_scopes.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "exchange.h" -#include -using namespace sycl; - -template -void exchange_test_scopes(queue q, size_t N) { - exchange_test(q, N); - exchange_test(q, N); - exchange_test(q, N); - exchange_test(q, N); -} - -template void exchange_test_orders_scopes(queue q, size_t N) { - exchange_test_scopes(q, N); - exchange_test_scopes(q, N); - exchange_test_scopes(q, N); - exchange_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - - exchange_test_orders_scopes(q, N); - - exchange_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - exchange_test_orders_scopes(q, N); - exchange_test_orders_scopes(q, N); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp index 0e3517f922..81f1947fc9 100644 --- a/SYCL/AtomicRef/max.cpp +++ b/SYCL/AtomicRef/max.cpp @@ -1,26 +1,335 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. 
+ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// 
RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "max.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void max_local_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + 
it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + assert(cum == N - 1 + std::numeric_limits::max() / 2); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +template +void max_global_test(queue q, size_t N) { + T initial = std::numeric_limits::lowest(); + T val = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), std::numeric_limits::max()); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); + + // +max/2 to ensure correct signed/unsigned operation is applied + out[gid] = + atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); + }); + }); + } + + assert(val == N - 1 + std::numeric_limits::max() / 2); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_max returns original value + // Intermediate values should all be >= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] >= initial); + } +} + +template +void max_test(queue q, size_t N) { + max_local_test(q, N); + max_global_test(q, N); +} + +template +void max_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test(q,N); +#else + max_test(q,N); +#endif +} + +template void max_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test_scopes(q,N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + max_test_scopes(q,N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + 
return; + } + max_test_scopes(q,N); +#else + max_test_scopes(q,N); +#endif +} + int main() { queue q; constexpr int N = 32; - max_test(q, N); - max_test(q, N); - max_test(q, N); - - // Include long tests if they are 32 bits wide +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + max_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + } +#else + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); if constexpr (sizeof(long) == 4) { - max_test(q, N); - max_test(q, N); + max_test_orders_scopes(q, N); + max_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." << std::endl; } diff --git a/SYCL/AtomicRef/max.h b/SYCL/AtomicRef/max.h deleted file mode 100644 index d284a60096..0000000000 --- a/SYCL/AtomicRef/max.h +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void max_local_test(queue q, size_t N) { - T initial = std::numeric_limits::lowest(); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = - atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - assert(cum == N - 1 + std::numeric_limits::max() / 2); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_max returns original value - // Intermediate values should all be >= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] >= initial); - } -} - -template -void max_global_test(queue q, size_t N) { - T initial = std::numeric_limits::lowest(); - T val = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), std::numeric_limits::max()); - { - buffer val_buf(&val, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto val = val_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (val[0]); - - // +max/2 to ensure correct signed/unsigned operation is applied - out[gid] = - atm.fetch_max(T(gid) + std::numeric_limits::max() / 2, order); - }); - }); - } - - assert(val == N - 1 + std::numeric_limits::max() / 2); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_max returns original value - // Intermediate values should all be >= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] >= initial); - } -} - -template -void max_test(queue q, size_t N) { - max_local_test(q, N); - max_global_test(q, N); -} diff --git a/SYCL/AtomicRef/max_atomic64.cpp b/SYCL/AtomicRef/max_atomic64.cpp deleted file mode 100644 index 2439644d31..0000000000 --- a/SYCL/AtomicRef/max_atomic64.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "max.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - max_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - max_test(q, N); - max_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - max_test(q, N); - max_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/max_orders_scopes.cpp b/SYCL/AtomicRef/max_orders_scopes.cpp deleted file mode 100644 index 0320be5969..0000000000 --- a/SYCL/AtomicRef/max_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "max.h" -#include -using namespace sycl; - -template -void max_test_scopes(queue q, size_t N) { - max_test(q, N); - max_test(q, N); - max_test(q, N); - max_test(q, N); -} - -template void max_test_orders_scopes(queue q, size_t N) { - max_test_scopes(q, N); - max_test_scopes(q, N); - max_test_scopes(q, N); - max_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - max_test_orders_scopes(q, N); - max_test_orders_scopes(q, N); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp index d484911d96..9678867cfa 100644 --- a/SYCL/AtomicRef/min.cpp +++ b/SYCL/AtomicRef/min.cpp @@ -1,26 +1,333 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. 
+ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "min.h" +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER 
%t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o 
%t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include #include + using namespace sycl; +template +void min_local_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = initial; + it.barrier(access::fence_space::local_space); + 
auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_min(T(gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to 0 + assert(cum == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +template +void min_global_test(queue q, size_t N) { + T initial = std::numeric_limits::max(); + T val = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), 0); + { + buffer val_buf(&val, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto val = val_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (val[0]); + out[gid] = atm.fetch_min(T(gid), order); + }); + }); + } + + // Final value should be equal to 0 + assert(val == 0); + + // Only one work-item should have received the initial value + assert(std::count(output.begin(), output.end(), initial) == 1); + + // fetch_min returns original value + // Intermediate values should all be <= initial value + for (int i = 0; i < N; ++i) { + assert(output[i] <= initial); + } +} + +template +void min_test(queue q, size_t N) { + min_local_test(q, N); + min_global_test(q, N); +} + +template +void min_test_scopes(queue q, size_t N) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test(q,N); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::work_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test(q,N); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test(q,N); +#else + min_test(q,N); +#endif +} + +template void min_test_orders_scopes(queue q, size_t N) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test_scopes(q,N); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test_scopes(q,N); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + min_test_scopes(q,N); +#else + min_test_scopes(q,N); +#endif +} + int main() { queue q; 
constexpr int N = 32; - min_test(q, N); - min_test(q, N); - min_test(q, N); - - // Include long tests if they are 32 bits wide +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + min_test_orders_scopes(q, N); + if constexpr (sizeof(long) == 8) { + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + } + if constexpr (sizeof(long long) == 8) { + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + } +#else + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); if constexpr (sizeof(long) == 4) { - min_test(q, N); - min_test(q, N); + min_test_orders_scopes(q, N); + min_test_orders_scopes(q, N); } +#endif std::cout << "Test passed." << std::endl; } diff --git a/SYCL/AtomicRef/min.h b/SYCL/AtomicRef/min.h deleted file mode 100644 index ac5d32bebf..0000000000 --- a/SYCL/AtomicRef/min.h +++ /dev/null @@ -1,105 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void min_local_test(queue q, size_t N) { - T initial = std::numeric_limits::max(); - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = initial; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_min(T(gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to 0 - assert(cum == 0); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_min returns original value - // Intermediate values should all be <= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] <= initial); - } -} - -template -void min_global_test(queue q, size_t N) { - T initial = std::numeric_limits::max(); - T val = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), 0); - { - buffer val_buf(&val, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto val = val_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - int gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (val[0]); - out[gid] = atm.fetch_min(T(gid), order); - }); - }); - } - - // Final value should be equal to 0 - assert(val == 0); - - // Only one work-item should have received the initial value - assert(std::count(output.begin(), output.end(), initial) == 1); - - // fetch_min returns original value - // Intermediate values should all be <= initial value - for (int i = 0; i < N; ++i) { - assert(output[i] <= initial); - } -} - -template -void min_test(queue q, size_t N) { - min_local_test(q, N); - min_global_test(q, N); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/min_atomic64.cpp b/SYCL/AtomicRef/min_atomic64.cpp deleted file mode 100644 index fc21b63d4e..0000000000 --- a/SYCL/AtomicRef/min_atomic64.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "min.h" -#include -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - min_test(q, N); - - // Include long tests if they are 64 bits wide - if constexpr (sizeof(long) == 8) { - min_test(q, N); - min_test(q, N); - } - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - min_test(q, N); - min_test(q, N); - } - - std::cout << "Test passed." 
<< std::endl; -} diff --git a/SYCL/AtomicRef/min_orders_scopes.cpp b/SYCL/AtomicRef/min_orders_scopes.cpp deleted file mode 100644 index fdf7c620a5..0000000000 --- a/SYCL/AtomicRef/min_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "min.h" -#include -using namespace sycl; - -template -void min_test_scopes(queue q, size_t N) { - min_test(q, N); - min_test(q, N); - min_test(q, N); - min_test(q, N); -} - -template void min_test_orders_scopes(queue q, size_t N) { - min_test_scopes(q, N); - min_test_scopes(q, N); - min_test_scopes(q, N); - min_test_scopes(q, N); -} - -int main() { - queue q; - - constexpr int N = 32; - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - min_test_orders_scopes(q, N); - min_test_orders_scopes(q, N); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/or.cpp b/SYCL/AtomicRef/or.cpp new file mode 100644 index 0000000000..d6dea3e36d --- /dev/null +++ b/SYCL/AtomicRef/or.cpp @@ -0,0 +1,322 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. 
+ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// 
RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out 
-Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: 
%CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void or_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == 
memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void or_global_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_or(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each work-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void or_test(queue q) { + or_local_test(q); + or_global_test(q); +} + +template +void or_test_scopes(queue q) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test(q); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test(q); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test(q); +#else + or_test(q); +#endif +} + +template void or_test_orders_scopes(queue q) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test_scopes(q); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test_scopes(q); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + or_test_scopes(q); +#else + or_test_scopes(q); +#endif +} + +int main() { + queue q; + + constexpr int N = 32; +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + if 
constexpr (sizeof(long) == 8) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } + if constexpr (sizeof(long long) == 8) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } +#else + or_test_orders_scopes(q); + or_test_orders_scopes(q); + if constexpr (sizeof(long) == 4) { + or_test_orders_scopes(q); + or_test_orders_scopes(q); + } +#endif + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/or.h b/SYCL/AtomicRef/or.h deleted file mode 100644 index fa28b2f257..0000000000 --- a/SYCL/AtomicRef/or.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void or_local_test(queue q) { - const size_t N = 32; - T cum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = 0; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_or(T(1ll << gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each work-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void or_test(queue q) { - const size_t N = 32; - const T initial = 0; - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (cum[0]); - out[gid] = atm.fetch_or(T(1ll << gid), order); - }); - }); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each work-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} diff --git a/SYCL/AtomicRef/or_orders_scopes.cpp b/SYCL/AtomicRef/or_orders_scopes.cpp deleted file mode 100644 index 2adad06a45..0000000000 --- a/SYCL/AtomicRef/or_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "or.h" -#include -using namespace sycl; - -template -void or_test_scopes(queue q) { - or_test(q); - or_test(q); - or_test(q); - or_test(q); -} - -template void or_test_orders_scopes(queue q) { - or_test_scopes(q); - or_test_scopes(q); - or_test_scopes(q); - or_test_scopes(q); -} - -int main() { - queue q; - - constexpr int N = 32; - or_test_orders_scopes(q); - or_test_orders_scopes(q); - or_test_orders_scopes(q); - or_test_orders_scopes(q); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - or_test_orders_scopes(q); - or_test_orders_scopes(q); - } - - std::cout << "Test passed." << std::endl; -} diff --git a/SYCL/AtomicRef/xor.cpp b/SYCL/AtomicRef/xor.cpp new file mode 100644 index 0000000000..d600ca10d9 --- /dev/null +++ b/SYCL/AtomicRef/xor.cpp @@ -0,0 +1,322 @@ +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. +// This is controlled by macros, defined by RUN commands. 
Defaults (no macro for a group) are: 32 bit, relaxed and device. + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE +// RUN: 
%HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQUIRE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx 
-fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DRELEASE -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DATOMIC64 +// RUN: 
%HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DWORK_GROUP -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -DACQ_REL -DSYSTEM -DATOMIC64 +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +template +void xor_local_test(queue q) { + const size_t N = 32; + T cum = 0; + std::vector output(N); + std::fill(output.begin(), output.end(), T(123456)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + accessor loc(1, + cgh); + + cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + int gid = it.get_global_id(0); + if (gid == 0) + loc[0] = 0; + 
it.barrier(access::fence_space::local_space); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? memory_order::relaxed + : order, + scope, access::address_space::local_space > (loc[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + it.barrier(access::fence_space::local_space); + if (gid == 0) + cum[0] = loc[0]; + }); + }).wait_and_throw(); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each wxork-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_global_test(queue q) { + const size_t N = 32; + const T initial = 0; + T cum = initial; + std::vector output(N); + std::fill(output.begin(), output.end(), T(0)); + { + buffer cum_buf(&cum, 1); + buffer output_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto cum = cum_buf.template get_access(cgh); + auto out = + output_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + size_t gid = it.get_id(0); + auto atm = atomic_ref < T, + (order == memory_order::acquire || order == memory_order::release) + ? 
memory_order::relaxed + : order, + scope, access::address_space::global_space > (cum[0]); + out[gid] = atm.fetch_xor(T(1ll << gid), order); + }); + }); + } + + // Final value should be equal to N ones + assert(cum == T((1ll << N) - 1)); + + // All other values should be unique; each wxork-item sets one bit to 1 + std::sort(output.begin(), output.end()); + assert(std::unique(output.begin(), output.end()) == output.end()); +} + +template +void xor_test(queue q) { + xor_local_test(q); + xor_global_test(q); +} + +template +void xor_test_scopes(queue q) { + std::vector scopes = q.get_device().get_info(); +#if defined(SYSTEM) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test(q); +#elif defined(WORK_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test(q); +#elif defined(SUB_GROUP) + if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test(q); +#else + xor_test(q); +#endif +} + +template void xor_test_orders_scopes(queue q) { + std::vector orders = q.get_device().get_info(); +#if defined(ACQ_REL) + if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test_scopes(q); +#elif defined(ACQUIRE) + if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test_scopes(q); +#elif defined(RELEASE) + if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + std::cout << "Skipping test\n"; + return; + } + xor_test_scopes(q); +#else + xor_test_scopes(q); +#endif +} + +int main() { + queue q; + + constexpr int N = 32; +#ifdef ATOMIC64 + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; 
+ } + + if constexpr (sizeof(long) == 8) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } + if constexpr (sizeof(long long) == 8) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } +#else + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + if constexpr (sizeof(long) == 4) { + xor_test_orders_scopes(q); + xor_test_orders_scopes(q); + } +#endif + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/AtomicRef/xor.h b/SYCL/AtomicRef/xor.h deleted file mode 100644 index d928d81453..0000000000 --- a/SYCL/AtomicRef/xor.h +++ /dev/null @@ -1,96 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; - -template -void xor_local_test(queue q) { - const size_t N = 32; - T cum = 0; - std::vector output(N); - std::fill(output.begin(), output.end(), T(123456)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - accessor loc(1, - cgh); - - cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { - int gid = it.get_global_id(0); - if (gid == 0) - loc[0] = 0; - it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::local_space > (loc[0]); - out[gid] = atm.fetch_xor(T(1ll << gid), order); - it.barrier(access::fence_space::local_space); - if (gid == 0) - cum[0] = loc[0]; - }); - }).wait_and_throw(); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each wxork-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void xor_global_test(queue q) { - const size_t N = 32; - const T initial = 0; - T cum = initial; - std::vector output(N); - std::fill(output.begin(), output.end(), T(0)); - { - buffer cum_buf(&cum, 1); - buffer output_buf(output.data(), output.size()); - - q.submit([&](handler &cgh) { - auto cum = cum_buf.template get_access(cgh); - auto out = - output_buf.template get_access(cgh); - cgh.parallel_for(range<1>(N), [=](item<1> it) { - size_t gid = it.get_id(0); - auto atm = atomic_ref < T, - (order == memory_order::acquire || order == memory_order::release) - ? 
memory_order::relaxed - : order, - scope, access::address_space::global_space > (cum[0]); - out[gid] = atm.fetch_xor(T(1ll << gid), order); - }); - }); - } - - // Final value should be equal to N ones - assert(cum == T((1ll << N) - 1)); - - // All other values should be unique; each wxork-item sets one bit to 1 - std::sort(output.begin(), output.end()); - assert(std::unique(output.begin(), output.end()) == output.end()); -} - -template -void xor_test(queue q) { - xor_local_test(q); - xor_global_test(q); -} \ No newline at end of file diff --git a/SYCL/AtomicRef/xor_orders_scopes.cpp b/SYCL/AtomicRef/xor_orders_scopes.cpp deleted file mode 100644 index 3bd70c0550..0000000000 --- a/SYCL/AtomicRef/xor_orders_scopes.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_70 -// RUN: %HOST_RUN_PLACEHOLDER %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include "xor.h" -#include -using namespace sycl; - -template -void xor_test_scopes(queue q) { - xor_test(q); - xor_test(q); - xor_test(q); - xor_test(q); -} - -template void xor_test_orders_scopes(queue q) { - xor_test_scopes(q); - xor_test_scopes(q); - xor_test_scopes(q); - xor_test_scopes(q); -} - -int main() { - queue q; - - constexpr int N = 32; - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - - // Include long long tests if they are 64 bits wide - if constexpr (sizeof(long long) == 8) { - xor_test_orders_scopes(q); - xor_test_orders_scopes(q); - } - - std::cout << "Test passed." 
<< std::endl; -} From 0375249316941a74f74ca5fd0eb6cf7f8e71fb56 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Fri, 19 Nov 2021 13:30:52 +0100 Subject: [PATCH 04/27] [SYCL] format --- SYCL/AtomicRef/add.cpp | 59 +++++++++++++++++------------ SYCL/AtomicRef/and.cpp | 36 +++++++++++------- SYCL/AtomicRef/compare_exchange.cpp | 48 +++++++++++++---------- SYCL/AtomicRef/exchange.cpp | 40 +++++++++++-------- SYCL/AtomicRef/max.cpp | 50 ++++++++++++++---------- SYCL/AtomicRef/min.cpp | 50 ++++++++++++++---------- SYCL/AtomicRef/or.cpp | 34 +++++++++++------ SYCL/AtomicRef/xor.cpp | 34 +++++++++++------ 8 files changed, 215 insertions(+), 136 deletions(-) diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index af0db82b8c..bd6d538b65 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -150,10 +152,10 @@ #include #include #include +#include #include #include #include -#include using namespace sycl; @@ -377,56 +379,64 @@ void add_test(queue q, size_t N) { } } -template +template void add_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - add_test(q,N); + add_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - add_test(q,N); + add_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - add_test(q,N); + add_test(q, N); #else - add_test(q,N); + add_test(q, N); #endif } template -void - add_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); +void add_test_orders_scopes(queue q, size_t N) { + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - add_test_scopes(q,N); + add_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if 
(std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } - add_test_scopes(q,N); + add_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - add_test_scopes(q,N); + add_test_scopes(q, N); #else - add_test_scopes(q,N); + add_test_scopes(q, N); #endif } @@ -439,7 +449,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + add_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { add_test_orders_scopes(q, N); @@ -464,7 +474,6 @@ int main() { add_test_orders_scopes(q, N); } #endif - + std::cout << "Test passed." << std::endl; } - diff --git a/SYCL/AtomicRef/and.cpp b/SYCL/AtomicRef/and.cpp index 89355aa9e4..d64353bb2e 100644 --- a/SYCL/AtomicRef/and.cpp +++ b/SYCL/AtomicRef/and.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -242,21 +244,25 @@ void and_test(queue q) { template void and_test_scopes(queue q) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } and_test(q); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } and_test(q); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } @@ -267,21 +273,25 @@ void and_test_scopes(queue q) { } template void and_test_orders_scopes(queue q) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } and_test_scopes(q); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } and_test_scopes(q); #elif defined(RELEASE) - if(std::find(orders.begin(), 
orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } @@ -293,13 +303,13 @@ template void and_test_orders_scopes(queue q) { int main() { queue q; - + #ifdef ATOMIC64 if (!q.get_device().has(aspect::atomic64)) { std::cout << "Skipping test\n"; return 0; } - + if constexpr (sizeof(long) == 8) { and_test_orders_scopes(q); and_test_orders_scopes(q); diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp index 3df693f173..b85f65e299 100644 --- a/SYCL/AtomicRef/compare_exchange.cpp +++ b/SYCL/AtomicRef/compare_exchange.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -259,53 +261,61 @@ void compare_exchange_test(queue q, size_t N) { template void compare_exchange_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test(q,N); + compare_exchange_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test(q,N); + compare_exchange_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test(q,N); + compare_exchange_test(q, N); #else - compare_exchange_test(q,N); + compare_exchange_test(q, N); #endif } template void compare_exchange_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #elif defined(ACQUIRE) 
- if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #else - compare_exchange_test_scopes(q,N); + compare_exchange_test_scopes(q, N); #endif } diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp index 805c3ede18..5b208b9e7b 100644 --- a/SYCL/AtomicRef/exchange.cpp +++ b/SYCL/AtomicRef/exchange.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -244,46 +246,54 @@ void exchange_test(queue q, size_t N) { template void exchange_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - exchange_test(q,N); + exchange_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - exchange_test(q,N); + exchange_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - exchange_test(q,N); + exchange_test(q, N); #else - exchange_test(q,N); + exchange_test(q, N); #endif } template void exchange_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } exchange_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), 
memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } exchange_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp index 81f1947fc9..dfc5696ba9 100644 --- a/SYCL/AtomicRef/max.cpp +++ b/SYCL/AtomicRef/max.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -253,52 +255,60 @@ void max_test(queue q, size_t N) { template void max_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - max_test(q,N); + max_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - max_test(q,N); + max_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - max_test(q,N); + max_test(q, N); #else - max_test(q,N); + max_test(q, N); #endif } template void max_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - max_test_scopes(q,N); + max_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + 
orders.end()) { std::cout << "Skipping test\n"; return; } - max_test_scopes(q,N); + max_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - max_test_scopes(q,N); + max_test_scopes(q, N); #else - max_test_scopes(q,N); + max_test_scopes(q, N); #endif } @@ -311,7 +321,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + max_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { max_test_orders_scopes(q, N); diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp index 9678867cfa..c5ffef16cf 100644 --- a/SYCL/AtomicRef/min.cpp +++ b/SYCL/AtomicRef/min.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -251,52 +253,60 @@ void min_test(queue q, size_t N) { template void min_test_scopes(queue q, size_t N) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - min_test(q,N); + min_test(q, N); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - min_test(q,N); + min_test(q, N); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } - min_test(q,N); + min_test(q, N); #else - min_test(q,N); + min_test(q, N); #endif } template void min_test_orders_scopes(queue q, size_t N) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } - min_test_scopes(q,N); + min_test_scopes(q, N); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + 
orders.end()) { std::cout << "Skipping test\n"; return; } - min_test_scopes(q,N); + min_test_scopes(q, N); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } - min_test_scopes(q,N); + min_test_scopes(q, N); #else - min_test_scopes(q,N); + min_test_scopes(q, N); #endif } @@ -309,7 +319,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + min_test_orders_scopes(q, N); if constexpr (sizeof(long) == 8) { min_test_orders_scopes(q, N); diff --git a/SYCL/AtomicRef/or.cpp b/SYCL/AtomicRef/or.cpp index d6dea3e36d..651c4d8d8a 100644 --- a/SYCL/AtomicRef/or.cpp +++ b/SYCL/AtomicRef/or.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -242,21 +244,25 @@ void or_test(queue q) { template void or_test_scopes(queue q) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } or_test(q); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } or_test(q); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } @@ -267,21 +273,25 @@ void or_test_scopes(queue q) { } template void or_test_orders_scopes(queue q) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } or_test_scopes(q); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } or_test_scopes(q); #elif defined(RELEASE) - if(std::find(orders.begin(), orders.end(), 
memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } @@ -300,7 +310,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + if constexpr (sizeof(long) == 8) { or_test_orders_scopes(q); or_test_orders_scopes(q); diff --git a/SYCL/AtomicRef/xor.cpp b/SYCL/AtomicRef/xor.cpp index d600ca10d9..da78d49a06 100644 --- a/SYCL/AtomicRef/xor.cpp +++ b/SYCL/AtomicRef/xor.cpp @@ -1,5 +1,7 @@ -// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel semantic order and sub_group/work_group/device/system scope is tested separately. -// This is controlled by macros, defined by RUN commands. Defaults (no macro for a group) are: 32 bit, relaxed and device. +// Each combination of 64/32 bit atomic, relaxed/acquire/release/acq_rel +// semantic order and sub_group/work_group/device/system scope is tested +// separately. This is controlled by macros, defined by RUN commands. Defaults +// (no macro for a group) are: 32 bit, relaxed and device. 
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_60 // RUN: %HOST_RUN_PLACEHOLDER %t.out @@ -148,9 +150,9 @@ #include #include #include +#include #include #include -#include using namespace sycl; @@ -242,21 +244,25 @@ void xor_test(queue q) { template void xor_test_scopes(queue q) { - std::vector scopes = q.get_device().get_info(); + std::vector scopes = + q.get_device().get_info(); #if defined(SYSTEM) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } xor_test(q); #elif defined(WORK_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } xor_test(q); #elif defined(SUB_GROUP) - if(std::find(scopes.begin(), scopes.end(), memory_scope::system) == scopes.end()){ + if (std::find(scopes.begin(), scopes.end(), memory_scope::system) == + scopes.end()) { std::cout << "Skipping test\n"; return; } @@ -267,21 +273,25 @@ void xor_test_scopes(queue q) { } template void xor_test_orders_scopes(queue q) { - std::vector orders = q.get_device().get_info(); + std::vector orders = + q.get_device().get_info(); #if defined(ACQ_REL) - if(std::find(orders.begin(), orders.end(), memory_order::acq_rel) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) == + orders.end()) { std::cout << "Skipping test\n"; return; } xor_test_scopes(q); #elif defined(ACQUIRE) - if(std::find(orders.begin(), orders.end(), memory_order::acquire) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::acquire) == + orders.end()) { std::cout << "Skipping test\n"; return; } xor_test_scopes(q); #elif defined(RELEASE) - if(std::find(orders.begin(), 
orders.end(), memory_order::release) == orders.end()){ + if (std::find(orders.begin(), orders.end(), memory_order::release) == + orders.end()) { std::cout << "Skipping test\n"; return; } @@ -300,7 +310,7 @@ int main() { std::cout << "Skipping test\n"; return 0; } - + if constexpr (sizeof(long) == 8) { xor_test_orders_scopes(q); xor_test_orders_scopes(q); From c215e68b4000b7cc9ae27c8853a4a51c2a54dcaa Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Fri, 19 Nov 2021 13:39:27 +0100 Subject: [PATCH 05/27] [SYCL] add testing for both AtomicRef implementations --- SYCL/AtomicRef/add.cpp | 40 ++++++++++++++++++----------- SYCL/AtomicRef/and.cpp | 16 +++++++----- SYCL/AtomicRef/compare_exchange.cpp | 16 +++++++----- SYCL/AtomicRef/exchange.cpp | 16 +++++++----- SYCL/AtomicRef/max.cpp | 16 +++++++----- SYCL/AtomicRef/min.cpp | 16 +++++++----- SYCL/AtomicRef/or.cpp | 16 +++++++----- SYCL/AtomicRef/xor.cpp | 16 +++++++----- 8 files changed, 95 insertions(+), 57 deletions(-) diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index af0db82b8c..b6fd9e3e7c 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -157,7 +157,8 @@ using namespace sycl; -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_fetch_local_test(queue q, size_t N) { @@ -179,7 +180,7 @@ void add_fetch_local_test(queue q, size_t N) { if (gid == 0) loc[0] = 0; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -205,7 +206,8 @@ void add_fetch_local_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_fetch_test(queue q, size_t N) { @@ -221,7 +223,7 @@ void add_fetch_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -244,7 +246,8 @@ void add_fetch_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_plus_equal_test(queue q, size_t N) { @@ -261,7 +264,7 @@ void add_plus_equal_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -284,7 +287,8 @@ void add_plus_equal_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_pre_inc_test(queue q, size_t N) { @@ -301,7 +305,7 @@ void add_pre_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -324,7 +328,8 @@ void add_pre_inc_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, typename Difference = T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void add_post_inc_test(queue q, size_t N) { @@ -341,7 +346,7 @@ void add_post_inc_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -368,12 +373,17 @@ template void add_test(queue q, size_t N) { - add_fetch_local_test(q, N); - add_fetch_test(q, N); - add_plus_equal_test(q, N); + add_fetch_local_test<::sycl::ext::oneapi::atomic_ref, T, Difference, order, scope>(q, N); + add_fetch_local_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); + add_fetch_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_fetch_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); + add_plus_equal_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_plus_equal_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); if constexpr (!std::is_floating_point_v) { - add_pre_inc_test(q, N); - add_post_inc_test(q, N); + add_pre_inc_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_pre_inc_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); + add_post_inc_test<::sycl::ext::oneapi::atomic_ref, T, T, Difference, order, scope>(q, N); + add_post_inc_test<::sycl::atomic_ref, T, Difference, order, scope>(q, N); } } diff --git a/SYCL/AtomicRef/and.cpp b/SYCL/AtomicRef/and.cpp index 89355aa9e4..2960d862b3 100644 --- a/SYCL/AtomicRef/and.cpp +++ b/SYCL/AtomicRef/and.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class 
AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void and_local_test(queue q) { const size_t N = 32; @@ -176,7 +177,7 @@ void and_local_test(queue q) { if (gid == 0) loc[0] = T((1ll << N) - 1); it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -197,7 +198,8 @@ void and_local_test(queue q) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void and_global_test(queue q) { const size_t N = 32; @@ -215,7 +217,7 @@ void and_global_test(queue q) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -236,8 +238,10 @@ void and_global_test(queue q) { template void and_test(queue q) { - and_local_test(q); - and_global_test(q); + and_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + and_local_test<::sycl::atomic_ref, T, order, scope>(q); + and_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + and_global_test<::sycl::atomic_ref, T, order, scope>(q); } template diff --git a/SYCL/AtomicRef/compare_exchange.cpp b/SYCL/AtomicRef/compare_exchange.cpp index 3df693f173..2b04c693f3 100644 --- a/SYCL/AtomicRef/compare_exchange.cpp +++ b/SYCL/AtomicRef/compare_exchange.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void compare_exchange_local_test(queue q, size_t N) { const T initial = T(N); @@ -178,7 +179,7 @@ void compare_exchange_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -206,7 +207,8 @@ void compare_exchange_local_test(queue q, size_t N) { } } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void compare_exchange_global_test(queue q, size_t N) { const T initial = T(N); @@ -225,7 +227,7 @@ void compare_exchange_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -253,8 +255,10 @@ void compare_exchange_global_test(queue q, size_t N) { template void compare_exchange_test(queue q, size_t N) { - compare_exchange_local_test(q, N); - compare_exchange_global_test(q, N); + compare_exchange_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + compare_exchange_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + compare_exchange_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + compare_exchange_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/exchange.cpp b/SYCL/AtomicRef/exchange.cpp index 805c3ede18..1a3a074d65 100644 --- a/SYCL/AtomicRef/exchange.cpp +++ b/SYCL/AtomicRef/exchange.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void exchange_local_test(queue q, size_t N) { const T initial = T(N); @@ -176,7 +177,7 @@ void exchange_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -198,7 +199,8 @@ void exchange_local_test(queue q, size_t N) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void exchange_global_test(queue q, size_t N) { const T initial = T(N); @@ -216,7 +218,7 @@ void exchange_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -238,8 +240,10 @@ void exchange_global_test(queue q, size_t N) { template void exchange_test(queue q, size_t N) { - exchange_local_test(q, N); - exchange_global_test(q, N); + exchange_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + exchange_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + exchange_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + exchange_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/max.cpp b/SYCL/AtomicRef/max.cpp index 81f1947fc9..f6522a3a0f 100644 --- a/SYCL/AtomicRef/max.cpp +++ b/SYCL/AtomicRef/max.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void max_local_test(queue q, size_t N) { T initial = std::numeric_limits::lowest(); @@ -176,7 +177,7 @@ void max_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -202,7 +203,8 @@ void max_local_test(queue q, size_t N) { } } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void max_global_test(queue q, size_t N) { T initial = std::numeric_limits::lowest(); @@ -219,7 +221,7 @@ void max_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -247,8 +249,10 @@ void max_global_test(queue q, size_t N) { template void max_test(queue q, size_t N) { - max_local_test(q, N); - max_global_test(q, N); + max_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + max_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + max_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + max_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/min.cpp b/SYCL/AtomicRef/min.cpp index 9678867cfa..18eb8dd449 100644 --- a/SYCL/AtomicRef/min.cpp +++ b/SYCL/AtomicRef/min.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void min_local_test(queue q, size_t N) { T initial = std::numeric_limits::max(); @@ -176,7 +177,7 @@ void min_local_test(queue q, size_t N) { if (gid == 0) loc[0] = initial; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -202,7 +203,8 @@ void min_local_test(queue q, size_t N) { } } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void min_global_test(queue q, size_t N) { T initial = std::numeric_limits::max(); @@ -219,7 +221,7 @@ void min_global_test(queue q, size_t N) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { int gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -245,8 +247,10 @@ void min_global_test(queue q, size_t N) { template void min_test(queue q, size_t N) { - min_local_test(q, N); - min_global_test(q, N); + min_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + min_local_test<::sycl::atomic_ref, T, order, scope>(q, N); + min_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q, N); + min_global_test<::sycl::atomic_ref, T, order, scope>(q, N); } template diff --git a/SYCL/AtomicRef/or.cpp b/SYCL/AtomicRef/or.cpp index d6dea3e36d..834f3eb2d7 100644 --- a/SYCL/AtomicRef/or.cpp +++ b/SYCL/AtomicRef/or.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void or_local_test(queue q) { const size_t N = 32; @@ -176,7 +177,7 @@ void or_local_test(queue q) { if (gid == 0) loc[0] = 0; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -197,7 +198,8 @@ void or_local_test(queue q) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void or_global_test(queue q) { const size_t N = 32; @@ -215,7 +217,7 @@ void or_global_test(queue q) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -236,8 +238,10 @@ void or_global_test(queue q) { template void or_test(queue q) { - or_local_test(q); - or_global_test(q); + or_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + or_local_test<::sycl::atomic_ref, T, order, scope>(q); + or_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + or_global_test<::sycl::atomic_ref, T, order, scope>(q); } template diff --git a/SYCL/AtomicRef/xor.cpp b/SYCL/AtomicRef/xor.cpp index d600ca10d9..ae577f290e 100644 --- a/SYCL/AtomicRef/xor.cpp +++ b/SYCL/AtomicRef/xor.cpp @@ -154,7 +154,8 @@ using namespace sycl; -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void xor_local_test(queue q) { const size_t N = 32; @@ -176,7 +177,7 @@ void xor_local_test(queue q) { if (gid == 0) loc[0] = 0; it.barrier(access::fence_space::local_space); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? memory_order::relaxed : order, @@ -197,7 +198,8 @@ void xor_local_test(queue q) { assert(std::unique(output.begin(), output.end()) == output.end()); } -template + class AtomicRef, typename T, memory_order order = memory_order::relaxed, memory_scope scope = memory_scope::device> void xor_global_test(queue q) { const size_t N = 32; @@ -215,7 +217,7 @@ void xor_global_test(queue q) { output_buf.template get_access(cgh); cgh.parallel_for(range<1>(N), [=](item<1> it) { size_t gid = it.get_id(0); - auto atm = atomic_ref < T, + auto atm = AtomicRef < T, (order == memory_order::acquire || order == memory_order::release) ? 
memory_order::relaxed : order, @@ -236,8 +238,10 @@ void xor_global_test(queue q) { template void xor_test(queue q) { - xor_local_test(q); - xor_global_test(q); + xor_local_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + xor_local_test<::sycl::atomic_ref, T, order, scope>(q); + xor_global_test<::sycl::ext::oneapi::atomic_ref, T, order, scope>(q); + xor_global_test<::sycl::atomic_ref, T, order, scope>(q); } template From d18ca3411cd774f77575d1ec365eeb9bfb1a950e Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Fri, 19 Nov 2021 13:39:55 +0100 Subject: [PATCH 06/27] [SYCL] format --- SYCL/AtomicRef/add.cpp | 30 +++++++++++++++++++---------- SYCL/AtomicRef/and.cpp | 6 ++++-- SYCL/AtomicRef/compare_exchange.cpp | 12 ++++++++---- SYCL/AtomicRef/exchange.cpp | 6 ++++-- SYCL/AtomicRef/max.cpp | 6 ++++-- SYCL/AtomicRef/min.cpp | 6 ++++-- SYCL/AtomicRef/or.cpp | 6 ++++-- SYCL/AtomicRef/xor.cpp | 6 ++++-- 8 files changed, 52 insertions(+), 26 deletions(-) diff --git a/SYCL/AtomicRef/add.cpp b/SYCL/AtomicRef/add.cpp index b6ede2eba5..2bacd90a9e 100644 --- a/SYCL/AtomicRef/add.cpp +++ b/SYCL/AtomicRef/add.cpp @@ -160,7 +160,8 @@ using namespace sycl; template