diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc index 8ed8c738651e4..45bd67818db6a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc @@ -98,7 +98,8 @@ T ldg(const T* ptr); `ldg` returns the data of type `T` located at address `ptr`. When called from the `ext_oneapi_cuda` backend the data is cached in the read-only texture cache. When called from any other backend a copy of the data stored at address `ptr` is returned without using any special cache. -The template parameter `T` can be one of `char`, `signed char`, `short`, `int`, `long`, `long long`, `unsigned char`, `unsigned short`, `unsigned int`, `unsigned long`, `unsigned long long`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `vec`, `float`, `vec`, `vec`, `double`, or `vec`. +The template parameter `T` can be one of `char`, `signed char`, `short`, `int`, `long`, `long long`, `unsigned char`, `unsigned short`, `unsigned int`, `unsigned long`, `unsigned long long`, `half`, `float`, `double`, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, ``, or ``. + === Example of usage diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp index cffeb6d855bb9..c09665d3f59cd 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp @@ -27,10 +27,23 @@ namespace cuda { namespace detail { using ldg_vector_types = sycl::detail::type_list< - sycl::char2, sycl::char4, sycl::short2, sycl::short4, sycl::int2, - sycl::int4, sycl::longlong2, sycl::uchar2, sycl::uchar4, sycl::ushort2, - sycl::ushort4, sycl::uint2, sycl::uint4, sycl::ulonglong2, sycl::float2, - sycl::float4, sycl::double2>; + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec, + sycl::vec, sycl::vec, sycl::vec>; using ldg_types = sycl::detail::type_list) { return __nvvm_ldg_c(ptr); + } else if constexpr (std::is_same_v) { + return __nvvm_ldg_sc(ptr); } else if constexpr (std::is_same_v) { return __nvvm_ldg_s(ptr); } else if constexpr (std::is_same_v) { @@ -66,146 +81,387 @@ ldg(const T *ptr) { return __nvvm_ldg_ul(ptr); } else if constexpr (std::is_same_v) { return __nvvm_ldg_ull(ptr); + } else if constexpr (std::is_same_v) { + auto native = reinterpret_cast(ptr); + return __nvvm_ldg_h(native); } else if constexpr (std::is_same_v) { return __nvvm_ldg_f(ptr); } else if constexpr (std::is_same_v) { return __nvvm_ldg_d(ptr); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { // We can assume that ptr is aligned at least to char2's alignment, but the // load will assume that ptr is aligned to char2's alignment. This is only // safe if alignof(f2) <= alignof(char2). typedef char c2 ATTRIBUTE_EXT_VEC_TYPE(2); c2 rv = __nvvm_ldg_c2(reinterpret_cast(ptr)); - sycl::char2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef char c2 ATTRIBUTE_EXT_VEC_TYPE(2); + c2 rv_2 = __nvvm_ldg_c2(reinterpret_cast(ptr)); + char rv = __nvvm_ldg_c(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef char c4 ATTRIBUTE_EXT_VEC_TYPE(4); c4 rv = __nvvm_ldg_c4(reinterpret_cast(ptr)); - sycl::char4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef signed char sc2 ATTRIBUTE_EXT_VEC_TYPE(2); + sc2 rv = __nvvm_ldg_sc2(reinterpret_cast(ptr)); + sycl::vec ret; + ret.x() = rv[0]; + ret.y() = rv[1]; + return ret; + } else if constexpr (std::is_same_v>) { + typedef signed char sc2 ATTRIBUTE_EXT_VEC_TYPE(2); + sc2 rv_2 = __nvvm_ldg_sc2(reinterpret_cast(ptr)); + signed char rv = __nvvm_ldg_sc(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef signed char sc4 ATTRIBUTE_EXT_VEC_TYPE(4); + sc4 rv = __nvvm_ldg_sc4(reinterpret_cast(ptr)); + sycl::vec ret; + ret.x() = rv[0]; + ret.y() = rv[1]; + ret.z() = rv[2]; + ret.w() = rv[3]; + return ret; + } else if constexpr (std::is_same_v>) { typedef short s2 ATTRIBUTE_EXT_VEC_TYPE(2); s2 rv = __nvvm_ldg_s2(reinterpret_cast(ptr)); - sycl::short2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef short s2 ATTRIBUTE_EXT_VEC_TYPE(2); + s2 rv_2 = __nvvm_ldg_s2(reinterpret_cast(ptr)); + short rv = __nvvm_ldg_s(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef short s4 ATTRIBUTE_EXT_VEC_TYPE(4); s4 rv = __nvvm_ldg_s4(reinterpret_cast(ptr)); - sycl::short4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { typedef int i2 ATTRIBUTE_EXT_VEC_TYPE(2); i2 rv = __nvvm_ldg_i2(reinterpret_cast(ptr)); - sycl::int2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef int i2 ATTRIBUTE_EXT_VEC_TYPE(2); + i2 rv_2 = __nvvm_ldg_i2(reinterpret_cast(ptr)); + int rv = __nvvm_ldg_i(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef int i4 ATTRIBUTE_EXT_VEC_TYPE(4); i4 rv = __nvvm_ldg_i4(reinterpret_cast(ptr)); - sycl::int4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef long l2 ATTRIBUTE_EXT_VEC_TYPE(2); + l2 rv = __nvvm_ldg_l2(reinterpret_cast(ptr)); + sycl::vec ret; + ret.x() = rv[0]; + ret.y() = rv[1]; + return ret; + } else if constexpr (std::is_same_v>) { + typedef long l2 ATTRIBUTE_EXT_VEC_TYPE(2); + l2 rv_2 = __nvvm_ldg_l2(reinterpret_cast(ptr)); + long rv = __nvvm_ldg_l(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef long l2 ATTRIBUTE_EXT_VEC_TYPE(2); + l2 rv1 = __nvvm_ldg_l2(reinterpret_cast(ptr)); + l2 rv2 = __nvvm_ldg_l2(std::next(reinterpret_cast(ptr))); + sycl::vec ret; + ret.x() = rv1[0]; + ret.y() = rv1[1]; + ret.z() = rv2[0]; + ret.w() = rv2[1]; + return ret; + } else if constexpr (std::is_same_v>) { typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2); ll2 rv = __nvvm_ldg_ll2(reinterpret_cast(ptr)); - sycl::longlong2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2); + ll2 rv_2 = __nvvm_ldg_ll2(reinterpret_cast(ptr)); + long long rv = __nvvm_ldg_ll(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2); + ll2 rv1 = __nvvm_ldg_ll2(reinterpret_cast(ptr)); + ll2 rv2 = __nvvm_ldg_ll2(std::next(reinterpret_cast(ptr))); + sycl::vec ret; + ret.x() = rv1[0]; + ret.y() = rv1[1]; + ret.z() = rv2[0]; + ret.w() = rv2[1]; + return ret; + } else if constexpr (std::is_same_v>) { typedef unsigned char uc2 ATTRIBUTE_EXT_VEC_TYPE(2); uc2 rv = __nvvm_ldg_uc2(reinterpret_cast(ptr)); - sycl::uchar2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef unsigned char uc2 ATTRIBUTE_EXT_VEC_TYPE(2); + uc2 rv_2 = __nvvm_ldg_uc2(reinterpret_cast(ptr)); + unsigned char rv = __nvvm_ldg_uc(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef unsigned char uc4 ATTRIBUTE_EXT_VEC_TYPE(4); uc4 rv = __nvvm_ldg_uc4(reinterpret_cast(ptr)); - sycl::uchar4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { typedef unsigned short us2 ATTRIBUTE_EXT_VEC_TYPE(2); us2 rv = __nvvm_ldg_us2(reinterpret_cast(ptr)); - sycl::ushort2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef unsigned short us2 ATTRIBUTE_EXT_VEC_TYPE(2); + us2 rv_2 = __nvvm_ldg_us2(reinterpret_cast(ptr)); + unsigned short rv = __nvvm_ldg_us(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef unsigned short us4 ATTRIBUTE_EXT_VEC_TYPE(4); us4 rv = __nvvm_ldg_us4(reinterpret_cast(ptr)); - sycl::ushort4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { typedef unsigned int ui2 ATTRIBUTE_EXT_VEC_TYPE(2); ui2 rv = __nvvm_ldg_ui2(reinterpret_cast(ptr)); - sycl::uint2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef unsigned int ui2 ATTRIBUTE_EXT_VEC_TYPE(2); + ui2 rv_2 = __nvvm_ldg_ui2(reinterpret_cast(ptr)); + unsigned int rv = __nvvm_ldg_ui(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef unsigned int ui4 ATTRIBUTE_EXT_VEC_TYPE(4); ui4 rv = __nvvm_ldg_ui4(reinterpret_cast(ptr)); - sycl::uint4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef unsigned long ul2 ATTRIBUTE_EXT_VEC_TYPE(2); + ul2 rv = __nvvm_ldg_ul2(reinterpret_cast(ptr)); + sycl::vec ret; + ret.x() = rv[0]; + ret.y() = rv[1]; + return ret; + } else if constexpr (std::is_same_v>) { + typedef unsigned long ul2 ATTRIBUTE_EXT_VEC_TYPE(2); + ul2 rv_2 = __nvvm_ldg_ul2(reinterpret_cast(ptr)); + unsigned long rv = __nvvm_ldg_ul(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef unsigned long ul2 ATTRIBUTE_EXT_VEC_TYPE(2); + ul2 rv1 = __nvvm_ldg_ul2(reinterpret_cast(ptr)); + ul2 rv2 = __nvvm_ldg_ul2(std::next(reinterpret_cast(ptr))); + sycl::vec ret; + ret.x() = rv1[0]; + ret.y() = rv1[1]; + ret.z() = rv2[0]; + ret.w() = rv2[1]; + return ret; + } else if constexpr (std::is_same_v>) { typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2); ull2 rv = __nvvm_ldg_ull2(reinterpret_cast(ptr)); - sycl::ulonglong2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2); + ull2 rv_2 = __nvvm_ldg_ull2(reinterpret_cast(ptr)); + unsigned long long rv = + __nvvm_ldg_ull(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2); + ull2 rv1 = __nvvm_ldg_ull2(reinterpret_cast(ptr)); + ull2 rv2 = __nvvm_ldg_ull2(std::next(reinterpret_cast(ptr))); + sycl::vec ret; + ret.x() = rv1[0]; + ret.y() = rv1[1]; + ret.z() = rv2[0]; + ret.w() = rv2[1]; + return ret; + } else if constexpr (std::is_same_v>) { + typedef __fp16 h2 ATTRIBUTE_EXT_VEC_TYPE(2); + auto rv = __nvvm_ldg_h2(reinterpret_cast(ptr)); + sycl::vec ret; + ret.x() = rv[0]; + ret.y() = rv[1]; + return ret; + } else if constexpr (std::is_same_v>) { + typedef __fp16 h2 ATTRIBUTE_EXT_VEC_TYPE(2); + h2 rv_2 = __nvvm_ldg_h2(reinterpret_cast(ptr)); + auto rv = __nvvm_ldg_h(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef __fp16 h2 ATTRIBUTE_EXT_VEC_TYPE(2); + auto rv1 = __nvvm_ldg_h2(reinterpret_cast(ptr)); + auto rv2 = __nvvm_ldg_h2(std::next(reinterpret_cast(ptr))); + sycl::vec ret; + ret.x() = rv1[0]; + ret.y() = rv1[1]; + ret.z() = rv2[0]; + ret.w() = rv2[1]; + return ret; + } else if constexpr (std::is_same_v>) { typedef float f2 ATTRIBUTE_EXT_VEC_TYPE(2); f2 rv = __nvvm_ldg_f2(reinterpret_cast(ptr)); - sycl::float2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { + typedef float f2 ATTRIBUTE_EXT_VEC_TYPE(2); + f2 rv_2 = __nvvm_ldg_f2(reinterpret_cast(ptr)); + float rv = __nvvm_ldg_f(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { typedef float f4 ATTRIBUTE_EXT_VEC_TYPE(4); f4 rv = __nvvm_ldg_f4(reinterpret_cast(ptr)); - sycl::float4 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; ret.z() = rv[2]; ret.w() = rv[3]; return ret; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v>) { typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2); d2 rv = __nvvm_ldg_d2(reinterpret_cast(ptr)); - sycl::double2 ret; + sycl::vec ret; ret.x() = rv[0]; ret.y() = rv[1]; return ret; + } else if constexpr (std::is_same_v>) { + typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2); + d2 rv_2 = __nvvm_ldg_d2(reinterpret_cast(ptr)); + double rv = __nvvm_ldg_d(reinterpret_cast( + std::next(reinterpret_cast(ptr)))); + sycl::vec ret; + ret.x() = rv_2[0]; + ret.y() = rv_2[1]; + ret.z() = rv; + return ret; + } else if constexpr (std::is_same_v>) { + typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2); + d2 rv1 = __nvvm_ldg_d2(reinterpret_cast(ptr)); + d2 rv2 = __nvvm_ldg_d2(std::next(reinterpret_cast(ptr))); + sycl::vec ret; + ret.x() = rv1[0]; + ret.y() = rv1[1]; + ret.z() = rv2[0]; + ret.w() = rv2[1]; + return ret; } #else return *ptr; diff --git a/sycl/test/check_device_code/cuda/ldg.cpp b/sycl/test/check_device_code/cuda/ldg.cpp index 2cddb4fc8818d..e0ae28ac6a5f3 100644 --- a/sycl/test/check_device_code/cuda/ldg.cpp +++ b/sycl/test/check_device_code/cuda/ldg.cpp @@ -1,7 +1,7 @@ // REQUIRES: cuda -// RUN: %clangxx -Xclang -no-opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o -| FileCheck %s -// RUN: %clangxx -Xclang -opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE +// RUN: %clangxx -Xclang -no-opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xclang -fnative-half-type -S -Xclang -emit-llvm %s -o -| FileCheck %s +// RUN: %clangxx -Xclang -opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xclang -fnative-half-type -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE #include #include @@ -15,6 +15,7 @@ int main() { sycl::queue q; auto *in_c = sycl::malloc_device(1, q); + auto *in_sc = sycl::malloc_device(1, q); auto *in_s = sycl::malloc_device(1, q); auto *in_i = sycl::malloc_device(1, q); auto *in_l = sycl::malloc_device(1, q); @@ -26,38 +27,64 @@ int main() { auto *in_ul = sycl::malloc_device(1, q); auto *in_ull = sycl::malloc_device(1, q); - auto *in_c2 = sycl::malloc_device(1, q); - auto *in_s2 = sycl::malloc_device(1, q); - auto *in_i2 = sycl::malloc_device(1, q); - auto *in_ll2 = sycl::malloc_device(1, q); + auto *in_c2 = sycl::malloc_device>(1, q); + auto *in_c3 = sycl::malloc_device>(1, q); + auto *in_sc2 = sycl::malloc_device>(1, q); + auto *in_sc3 = sycl::malloc_device>(1, q); + auto *in_s2 = sycl::malloc_device>(1, q); + auto *in_s3 = sycl::malloc_device>(1, q); + auto *in_i2 = sycl::malloc_device>(1, q); + auto *in_i3 = sycl::malloc_device>(1, q); + auto *in_l2 = sycl::malloc_device>(1, q); + auto *in_l3 = sycl::malloc_device>(1, q); + auto *in_ll2 = sycl::malloc_device>(1, q); + auto *in_ll3 = sycl::malloc_device>(1, q); + auto *in_l4 = sycl::malloc_device>(1, q); + auto *in_ll4 = sycl::malloc_device>(1, q); - auto *in_c4 = sycl::malloc_device(1, q); - auto *in_s4 = sycl::malloc_device(1, q); - auto *in_i4 = sycl::malloc_device(1, q); + auto *in_c4 = sycl::malloc_device>(1, q); + auto *in_sc4 = sycl::malloc_device>(1, q); + auto *in_s4 = sycl::malloc_device>(1, q); + auto *in_i4 = sycl::malloc_device>(1, q); - auto *in_uc2 = sycl::malloc_device(1, q); - auto *in_us2 = sycl::malloc_device(1, q); - auto *in_ui2 = sycl::malloc_device(1, q); - auto *in_ull2 = sycl::malloc_device(1, q); + auto *in_uc2 = sycl::malloc_device>(1, q); + auto *in_uc3 = sycl::malloc_device>(1, q); + auto *in_us2 = sycl::malloc_device>(1, q); + auto *in_us3 = sycl::malloc_device>(1, q); + auto *in_ui2 = sycl::malloc_device>(1, q); + auto *in_ui3 = sycl::malloc_device>(1, q); + auto *in_ul2 = sycl::malloc_device>(1, q); + auto *in_ul3 = sycl::malloc_device>(1, q); + auto *in_ull2 = sycl::malloc_device>(1, q); + auto *in_ull3 = sycl::malloc_device>(1, q); + auto *in_ul4 = sycl::malloc_device>(1, q); + auto *in_ull4 = sycl::malloc_device>(1, q); - auto *in_uc4 = sycl::malloc_device(1, q); - auto *in_us4 = sycl::malloc_device(1, q); - auto *in_ui4 = sycl::malloc_device(1, q); + auto *in_uc4 = sycl::malloc_device>(1, q); + auto *in_us4 = sycl::malloc_device>(1, q); + auto *in_ui4 = sycl::malloc_device>(1, q); + auto *in_h = sycl::malloc_device(1, q); auto *in_f = sycl::malloc_device(1, q); auto *in_d = sycl::malloc_device(1, q); - auto *in_f2 = sycl::malloc_device(1, q); - auto *in_d2 = sycl::malloc_device(1, q); - - auto *in_f4 = sycl::malloc_device(1, q); - - auto *out_d = sycl::malloc_device(1, q); + auto *in_h2 = sycl::malloc_device>(1, q); + auto *in_h3 = sycl::malloc_device>(1, q); + auto *in_h4 = sycl::malloc_device>(1, q); + auto *in_f2 = sycl::malloc_device>(1, q); + auto *in_f3 = sycl::malloc_device>(1, q); + auto *in_f4 = sycl::malloc_device>(1, q); + auto *in_d2 = sycl::malloc_device>(1, q); + auto *in_d3 = sycl::malloc_device>(1, q); + auto *in_d4 = sycl::malloc_device>(1, q); q.wait(); q.submit([=](sycl::handler &h) { h.single_task([=] { + //CHECK: tail call half @llvm.nvvm.ldg.global.f.f16.p0f16(half* %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr %{{.*}}, i32 2) + auto cached_h = ldg(&in_h[0]); //CHECK: tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %{{.*}}, i32 4) //CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %{{.*}}, i32 4) auto cached_f = ldg(&in_f[0]); @@ -65,12 +92,40 @@ int main() { //CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr %{{.*}}, i32 8) auto cached_d = ldg(&in_d[0]); + //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) + auto cached_h2 = ldg(&in_h2[0]); + //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* %{{.*}}, i32 4) + //CHECK: tail call half @llvm.nvvm.ldg.global.f.f16.p0f16(half* nonnull %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr nonnull %{{.*}}, i32 2) + auto cached_h3 = ldg(&in_h3[0]); + //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* %{{.*}}, i32 4) + //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* nonnull %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr nonnull %{{.*}}, i32 4) + auto cached_h4 = ldg(&in_h4[0]); //CHECK: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* %{{.*}}, i32 8) //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8) auto cached_f2 = ldg(&in_f2[0]); + //CHECK: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* %{{.*}}, i32 8) + //CHECK: tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nonnull %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr nonnull %{{.*}}, i32 4) + auto cached_f3 = ldg(&in_f3[0]); //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16) //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) auto cached_d2 = ldg(&in_d2[0]); + //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16) + //CHECK: tail call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* nonnull %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr nonnull %{{.*}}, i32 8) + auto cached_d3 = ldg(&in_d3[0]); + //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16) + //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* nonnull %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr nonnull %{{.*}}, i32 16) + auto cached_d4 = ldg(&in_d4[0]); //CHECK: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 16) //CHECK-OPAQUE: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %{{.*}}, i32 16) auto cached_f4 = ldg(&in_f4[0]); @@ -81,6 +136,9 @@ int main() { //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* %{{.*}}, i32 1) //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) auto cached_c = ldg(&in_c[0]); + //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* %{{.*}}, i32 1) + //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1) + auto cached_sc = ldg(&in_sc[0]); //CHECK: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* %{{.*}}, i32 2) //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2) auto cached_s = ldg(&in_s[0]); @@ -112,31 +170,118 @@ int main() { //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2) //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) auto cached_c2 = ldg(&in_c2[0]); + //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2) + //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* nonnull %{{.*}}, i32 1) + //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) + auto cached_c3 = ldg(&in_c3[0]); + //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) + auto cached_sc2 = ldg(&in_sc2[0]); + //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2) + //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* nonnull %{{.*}}, i32 1) + //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) + auto cached_sc3 = ldg(&in_sc3[0]); //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4) //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) auto cached_s2 = ldg(&in_s2[0]); + //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4) + //CHECK: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* nonnull %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2) + auto cached_s3 = ldg(&in_s3[0]); //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8) //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) auto cached_i2 = ldg(&in_i2[0]); + //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8) + //CHECK: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* nonnull %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4) + auto cached_i3 = ldg(&in_i3[0]); + //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}}) + auto cached_l2 = ldg(&in_l2[0]); + //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* %{{.*}}, i32 {{8|16}}) + //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) + auto cached_l3 = ldg(&in_l3[0]); //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16) //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) auto cached_ll2 = ldg(&in_ll2[0]); + //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16) + //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) + auto cached_ll3 = ldg(&in_ll3[0]); + //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* %{{.*}}, i32 {{8|16}}) + //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* nonnull %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}}) + auto cached_l4 = ldg(&in_l4[0]); + //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16) + //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* nonnull %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) + auto cached_ll4 = ldg(&in_ll4[0]); //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2) //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) auto cached_uc2 = ldg(&in_uc2[0]); + //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2) + //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* nonnull %{{.*}}, i32 1) + //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1) + auto cached_uc3 = ldg(&in_uc3[0]); //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4) //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) auto cached_us2 = ldg(&in_us2[0]); + //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4) + //CHECK: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* nonnull %{{.*}}, i32 2) + //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2) + auto cached_us3 = ldg(&in_us3[0]); //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8) //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) auto cached_ui2 = ldg(&in_ui2[0]); + //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8) + //CHECK: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* nonnull %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4) + auto cached_ui3 = ldg(&in_ui3[0]); + //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}}) + auto cached_ul2 = ldg(&in_ul2[0]); + //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* %{{.*}}, i32 {{8|16}}) + //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) + auto cached_ul3 = ldg(&in_ul3[0]); //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16) //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) auto cached_ull2 = ldg(&in_ull2[0]); + //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16) + //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8) + //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8) + auto cached_ull3 = ldg(&in_ull3[0]); + //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* %{{.*}}, i32 {{8|16}}) + //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* nonnull %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}}) + //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}}) + auto cached_ul4 = ldg(&in_ul4[0]); + //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16) + //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* nonnull %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16) + //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16) + auto cached_ull4 = ldg(&in_ull4[0]); //CHECK: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* %{{.*}}, i32 4) //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) auto cached_c4 = ldg(&in_c4[0]); + //CHECK: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* %{{.*}}, i32 4) + //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4) + auto cached_sc4 = ldg(&in_sc4[0]); //CHECK: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* %{{.*}}, i32 8) //CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8) auto cached_s4 = ldg(&in_s4[0]); @@ -158,12 +303,20 @@ int main() { q.wait(); + free(in_h, q); free(in_f, q); free(in_d, q); + free(in_h2, q); + free(in_h3, q); + free(in_h4, q); free(in_f2, q); + free(in_f3, q); free(in_f4, q); free(in_d2, q); + free(in_d3, q); + free(in_d4, q); free(in_c, q); + free(in_sc, q); free(in_s, q); free(in_i, q); free(in_l, q); @@ -174,14 +327,33 @@ int main() { free(in_ul, q); free(in_ull, q); free(in_c2, q); + free(in_c3, q); + free(in_sc2, q); + free(in_sc3, q); free(in_s2, q); + free(in_s3, q); free(in_i2, q); + free(in_i3, q); + free(in_l2, q); + free(in_l3, q); free(in_ll2, q); + free(in_ll3, q); + free(in_l4, q); + free(in_ll4, q); free(in_uc2, q); + free(in_uc3, q); free(in_us2, q); + free(in_us3, q); free(in_ui2, q); + free(in_ui3, q); + free(in_ul2, q); + free(in_ul3, q); free(in_ull2, q); + free(in_ull3, q); + free(in_ul4, q); + free(in_ull4, q); free(in_c4, q); + free(in_sc4, q); free(in_s4, q); free(in_i4, q); free(in_uc4, q);