diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc
index 8ed8c738651e4..45bd67818db6a 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_cuda_tex_cache_read.asciidoc
@@ -98,7 +98,8 @@ T ldg(const T* ptr);
 `ldg` returns the data of type `T` located at address `ptr`. When called from the `ext_oneapi_cuda` backend the data is cached in the read-only texture cache.
 When called from any other backend a copy of the data stored at address `ptr` is returned without using any special cache.
 
-The template parameter `T` can be one of `char`, `signed char`, `short`, `int`, `long`, `long long`, `unsigned char`, `unsigned short`, `unsigned int`, `unsigned long`, `unsigned long long`, `vec<char, 2>`, `vec<char, 4>`, `vec<short, 2>`, `vec<short, 4>`, `vec<int, 2>`, `vec<int, 4>`, `vec<long long, 2>`, `vec<uchar, 2>`, `vec<uchar, 4>`, `vec<ushort, 2>`, `vec<ushort, 4>`, `vec<uint, 2>`, `vec<uint, 4>`, `vec<unsigned long long, 2>`, `float`, `vec<float, 2>`, `vec<float, 4>`, `double`, or `vec<double, 2>`.
+The template parameter `T` can be one of `char`, `signed char`, `short`, `int`, `long`, `long long`, `unsigned char`, `unsigned short`, `unsigned int`, `unsigned long`, `unsigned long long`, `half`, `float`, `double`, `<char, 2>`, `<char, 3>`, `<char, 4>`, `<signed char, 2>`, `<signed char, 3>`, `<signed char, 4>`, `<short, 2>`, `<short, 3>`, `<short, 4>`, `<int, 2>`, `<int, 3>`, `<int, 4>`, `<long, 2>`, `<long, 3>`, `<long, 4>`, `<long long, 2>`, `<long long, 3>`, `<long long, 4>`, `<unsigned char, 2>`, `<unsigned char, 3>`, `<unsigned char, 4>`, `<unsigned short, 2>`, `<unsigned short, 3>`, `<unsigned short, 4>`, `<unsigned int, 2>`, `<unsigned int, 3>`, `<unsigned int, 4>`, `<unsigned long, 2>`, `<unsigned long, 3>`, `<unsigned long, 4>`, `<unsigned long long, 2>`, `<unsigned long long, 3>`, `<unsigned long long, 4>`, `<half, 2>`, `<half, 3>`, `<half, 4>`, `<float, 2>`, `<float, 3>`, `<float, 4>`, `<double, 2>`, `<double, 3>`, or `<double, 4>`.
+
 
 === Example of usage
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp
index cffeb6d855bb9..c09665d3f59cd 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/builtins.hpp
@@ -27,10 +27,23 @@ namespace cuda {
 
 namespace detail {
 using ldg_vector_types = sycl::detail::type_list<
-    sycl::char2, sycl::char4, sycl::short2, sycl::short4, sycl::int2,
-    sycl::int4, sycl::longlong2, sycl::uchar2, sycl::uchar4, sycl::ushort2,
-    sycl::ushort4, sycl::uint2, sycl::uint4, sycl::ulonglong2, sycl::float2,
-    sycl::float4, sycl::double2>;
+    sycl::vec<char, 2>, sycl::vec<char, 3>, sycl::vec<char, 4>,
+    sycl::vec<signed char, 2>, sycl::vec<signed char, 3>,
+    sycl::vec<signed char, 4>, sycl::vec<short, 2>, sycl::vec<short, 3>,
+    sycl::vec<short, 4>, sycl::vec<int, 2>, sycl::vec<int, 3>,
+    sycl::vec<int, 4>, sycl::vec<long, 2>, sycl::vec<long, 3>,
+    sycl::vec<long, 4>, sycl::vec<long long, 2>, sycl::vec<long long, 3>,
+    sycl::vec<long long, 4>, sycl::vec<unsigned char, 2>,
+    sycl::vec<unsigned char, 3>, sycl::vec<unsigned char, 4>,
+    sycl::vec<unsigned short, 2>, sycl::vec<unsigned short, 3>,
+    sycl::vec<unsigned short, 4>, sycl::vec<unsigned int, 2>,
+    sycl::vec<unsigned int, 3>, sycl::vec<unsigned int, 4>,
+    sycl::vec<unsigned long, 2>, sycl::vec<unsigned long, 3>,
+    sycl::vec<unsigned long, 4>, sycl::vec<unsigned long long, 2>,
+    sycl::vec<unsigned long long, 3>, sycl::vec<unsigned long long, 4>,
+    sycl::vec<half, 2>, sycl::vec<half, 3>, sycl::vec<half, 4>,
+    sycl::vec<float, 2>, sycl::vec<float, 3>, sycl::vec<float, 4>,
+    sycl::vec<double, 2>, sycl::vec<double, 3>, sycl::vec<double, 4>>;
 
 using ldg_types =
     sycl::detail::type_list<ldg_vector_types,
@@ -48,6 +61,8 @@ ldg(const T *ptr) {
 #if defined(__NVPTX__)
   if constexpr (std::is_same_v<T, char>) {
     return __nvvm_ldg_c(ptr);
+  } else if constexpr (std::is_same_v<T, signed char>) {
+    return __nvvm_ldg_sc(ptr);
   } else if constexpr (std::is_same_v<T, short>) {
     return __nvvm_ldg_s(ptr);
   } else if constexpr (std::is_same_v<T, int>) {
@@ -66,146 +81,387 @@ ldg(const T *ptr) {
     return __nvvm_ldg_ul(ptr);
   } else if constexpr (std::is_same_v<T, unsigned long long>) {
     return __nvvm_ldg_ull(ptr);
+  } else if constexpr (std::is_same_v<T, half>) {
+    auto native = reinterpret_cast<const __fp16 *>(ptr);
+    return __nvvm_ldg_h(native);
   } else if constexpr (std::is_same_v<T, float>) {
     return __nvvm_ldg_f(ptr);
   } else if constexpr (std::is_same_v<T, double>) {
     return __nvvm_ldg_d(ptr);
-  } else if constexpr (std::is_same_v<T, sycl::char2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<char, 2>>) {
     // We can assume that ptr is aligned at least to char2's alignment, but the
     // load will assume that ptr is aligned to char2's alignment.  This is only
     // safe if alignof(f2) <= alignof(char2).
     typedef char c2 ATTRIBUTE_EXT_VEC_TYPE(2);
     c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
-    sycl::char2 ret;
+    sycl::vec<char, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::char4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<char, 3>>) {
+    typedef char c2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    c2 rv_2 = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+    char rv = __nvvm_ldg_c(reinterpret_cast<const char *>(
+        std::next(reinterpret_cast<const c2 *>(ptr))));
+    sycl::vec<char, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<char, 4>>) {
     typedef char c4 ATTRIBUTE_EXT_VEC_TYPE(4);
     c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
-    sycl::char4 ret;
+    sycl::vec<char, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::short2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<signed char, 2>>) {
+    typedef signed char sc2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    sc2 rv = __nvvm_ldg_sc2(reinterpret_cast<const sc2 *>(ptr));
+    sycl::vec<signed char, 2> ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<signed char, 3>>) {
+    typedef signed char sc2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    sc2 rv_2 = __nvvm_ldg_sc2(reinterpret_cast<const sc2 *>(ptr));
+    signed char rv = __nvvm_ldg_sc(reinterpret_cast<const signed char *>(
+        std::next(reinterpret_cast<const sc2 *>(ptr))));
+    sycl::vec<signed char, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<signed char, 4>>) {
+    typedef signed char sc4 ATTRIBUTE_EXT_VEC_TYPE(4);
+    sc4 rv = __nvvm_ldg_sc4(reinterpret_cast<const sc4 *>(ptr));
+    sycl::vec<signed char, 4> ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    ret.z() = rv[2];
+    ret.w() = rv[3];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<short, 2>>) {
     typedef short s2 ATTRIBUTE_EXT_VEC_TYPE(2);
     s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
-    sycl::short2 ret;
+    sycl::vec<short, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::short4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<short, 3>>) {
+    typedef short s2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    s2 rv_2 = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+    short rv = __nvvm_ldg_s(reinterpret_cast<const short *>(
+        std::next(reinterpret_cast<const s2 *>(ptr))));
+    sycl::vec<short, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<short, 4>>) {
     typedef short s4 ATTRIBUTE_EXT_VEC_TYPE(4);
     s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
-    sycl::short4 ret;
+    sycl::vec<short, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::int2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<int, 2>>) {
     typedef int i2 ATTRIBUTE_EXT_VEC_TYPE(2);
     i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
-    sycl::int2 ret;
+    sycl::vec<int, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::int4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<int, 3>>) {
+    typedef int i2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    i2 rv_2 = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+    int rv = __nvvm_ldg_i(reinterpret_cast<const int *>(
+        std::next(reinterpret_cast<const i2 *>(ptr))));
+    sycl::vec<int, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<int, 4>>) {
     typedef int i4 ATTRIBUTE_EXT_VEC_TYPE(4);
     i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
-    sycl::int4 ret;
+    sycl::vec<int, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::longlong2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<long, 2>>) {
+    typedef long l2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    l2 rv = __nvvm_ldg_l2(reinterpret_cast<const l2 *>(ptr));
+    sycl::vec<long, 2> ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<long, 3>>) {
+    typedef long l2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    l2 rv_2 = __nvvm_ldg_l2(reinterpret_cast<const l2 *>(ptr));
+    long rv = __nvvm_ldg_l(reinterpret_cast<const long *>(
+        std::next(reinterpret_cast<const l2 *>(ptr))));
+    sycl::vec<long, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<long, 4>>) {
+    typedef long l2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    l2 rv1 = __nvvm_ldg_l2(reinterpret_cast<const l2 *>(ptr));
+    l2 rv2 = __nvvm_ldg_l2(std::next(reinterpret_cast<const l2 *>(ptr)));
+    sycl::vec<long, 4> ret;
+    ret.x() = rv1[0];
+    ret.y() = rv1[1];
+    ret.z() = rv2[0];
+    ret.w() = rv2[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<long long, 2>>) {
     typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2);
     ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
-    sycl::longlong2 ret;
+    sycl::vec<long long, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::uchar2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<long long, 3>>) {
+    typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ll2 rv_2 = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+    long long rv = __nvvm_ldg_ll(reinterpret_cast<const long long *>(
+        std::next(reinterpret_cast<const ll2 *>(ptr))));
+    sycl::vec<long long, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<long long, 4>>) {
+    typedef long long ll2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ll2 rv1 = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+    ll2 rv2 = __nvvm_ldg_ll2(std::next(reinterpret_cast<const ll2 *>(ptr)));
+    sycl::vec<long long, 4> ret;
+    ret.x() = rv1[0];
+    ret.y() = rv1[1];
+    ret.z() = rv2[0];
+    ret.w() = rv2[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned char, 2>>) {
     typedef unsigned char uc2 ATTRIBUTE_EXT_VEC_TYPE(2);
     uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
-    sycl::uchar2 ret;
+    sycl::vec<unsigned char, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::uchar4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned char, 3>>) {
+    typedef unsigned char uc2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    uc2 rv_2 = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+    unsigned char rv = __nvvm_ldg_uc(reinterpret_cast<const unsigned char *>(
+        std::next(reinterpret_cast<const uc2 *>(ptr))));
+    sycl::vec<unsigned char, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned char, 4>>) {
     typedef unsigned char uc4 ATTRIBUTE_EXT_VEC_TYPE(4);
     uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
-    sycl::uchar4 ret;
+    sycl::vec<unsigned char, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::ushort2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned short, 2>>) {
     typedef unsigned short us2 ATTRIBUTE_EXT_VEC_TYPE(2);
     us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
-    sycl::ushort2 ret;
+    sycl::vec<unsigned short, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::ushort4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned short, 3>>) {
+    typedef unsigned short us2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    us2 rv_2 = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+    unsigned short rv = __nvvm_ldg_us(reinterpret_cast<const unsigned short *>(
+        std::next(reinterpret_cast<const us2 *>(ptr))));
+    sycl::vec<unsigned short, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned short, 4>>) {
     typedef unsigned short us4 ATTRIBUTE_EXT_VEC_TYPE(4);
     us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
-    sycl::ushort4 ret;
+    sycl::vec<unsigned short, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::uint2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned int, 2>>) {
     typedef unsigned int ui2 ATTRIBUTE_EXT_VEC_TYPE(2);
     ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
-    sycl::uint2 ret;
+    sycl::vec<unsigned int, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::uint4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned int, 3>>) {
+    typedef unsigned int ui2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ui2 rv_2 = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+    unsigned int rv = __nvvm_ldg_ui(reinterpret_cast<const unsigned int *>(
+        std::next(reinterpret_cast<const ui2 *>(ptr))));
+    sycl::vec<unsigned int, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned int, 4>>) {
     typedef unsigned int ui4 ATTRIBUTE_EXT_VEC_TYPE(4);
     ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
-    sycl::uint4 ret;
+    sycl::vec<unsigned int, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::ulonglong2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned long, 2>>) {
+    typedef unsigned long ul2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ul2 rv = __nvvm_ldg_ul2(reinterpret_cast<const ul2 *>(ptr));
+    sycl::vec<unsigned long, 2> ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned long, 3>>) {
+    typedef unsigned long ul2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ul2 rv_2 = __nvvm_ldg_ul2(reinterpret_cast<const ul2 *>(ptr));
+    unsigned long rv = __nvvm_ldg_ul(reinterpret_cast<const unsigned long *>(
+        std::next(reinterpret_cast<const ul2 *>(ptr))));
+    sycl::vec<unsigned long, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned long, 4>>) {
+    typedef unsigned long ul2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ul2 rv1 = __nvvm_ldg_ul2(reinterpret_cast<const ul2 *>(ptr));
+    ul2 rv2 = __nvvm_ldg_ul2(std::next(reinterpret_cast<const ul2 *>(ptr)));
+    sycl::vec<unsigned long, 4> ret;
+    ret.x() = rv1[0];
+    ret.y() = rv1[1];
+    ret.z() = rv2[0];
+    ret.w() = rv2[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned long long, 2>>) {
     typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2);
     ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
-    sycl::ulonglong2 ret;
+    sycl::vec<unsigned long long, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::float2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned long long, 3>>) {
+    typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ull2 rv_2 = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+    unsigned long long rv =
+        __nvvm_ldg_ull(reinterpret_cast<const unsigned long long *>(
+            std::next(reinterpret_cast<const ull2 *>(ptr))));
+    sycl::vec<unsigned long long, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<unsigned long long, 4>>) {
+    typedef unsigned long long ull2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    ull2 rv1 = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+    ull2 rv2 = __nvvm_ldg_ull2(std::next(reinterpret_cast<const ull2 *>(ptr)));
+    sycl::vec<unsigned long long, 4> ret;
+    ret.x() = rv1[0];
+    ret.y() = rv1[1];
+    ret.z() = rv2[0];
+    ret.w() = rv2[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<half, 2>>) {
+    typedef __fp16 h2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    auto rv = __nvvm_ldg_h2(reinterpret_cast<const h2 *>(ptr));
+    sycl::vec<half, 2> ret;
+    ret.x() = rv[0];
+    ret.y() = rv[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<half, 3>>) {
+    typedef __fp16 h2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    h2 rv_2 = __nvvm_ldg_h2(reinterpret_cast<const h2 *>(ptr));
+    auto rv = __nvvm_ldg_h(reinterpret_cast<const __fp16 *>(
+        std::next(reinterpret_cast<const h2 *>(ptr))));
+    sycl::vec<half, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<half, 4>>) {
+    typedef __fp16 h2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    auto rv1 = __nvvm_ldg_h2(reinterpret_cast<const h2 *>(ptr));
+    auto rv2 = __nvvm_ldg_h2(std::next(reinterpret_cast<const h2 *>(ptr)));
+    sycl::vec<half, 4> ret;
+    ret.x() = rv1[0];
+    ret.y() = rv1[1];
+    ret.z() = rv2[0];
+    ret.w() = rv2[1];
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<float, 2>>) {
     typedef float f2 ATTRIBUTE_EXT_VEC_TYPE(2);
     f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
-    sycl::float2 ret;
+    sycl::vec<float, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::float4>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<float, 3>>) {
+    typedef float f2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    f2 rv_2 = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+    float rv = __nvvm_ldg_f(reinterpret_cast<const float *>(
+        std::next(reinterpret_cast<const f2 *>(ptr))));
+    sycl::vec<float, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<float, 4>>) {
     typedef float f4 ATTRIBUTE_EXT_VEC_TYPE(4);
     f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
-    sycl::float4 ret;
+    sycl::vec<float, 4> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     ret.z() = rv[2];
     ret.w() = rv[3];
     return ret;
-  } else if constexpr (std::is_same_v<T, sycl::double2>) {
+  } else if constexpr (std::is_same_v<T, sycl::vec<double, 2>>) {
     typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2);
     d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
-    sycl::double2 ret;
+    sycl::vec<double, 2> ret;
     ret.x() = rv[0];
     ret.y() = rv[1];
     return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<double, 3>>) {
+    typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    d2 rv_2 = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+    double rv = __nvvm_ldg_d(reinterpret_cast<const double *>(
+        std::next(reinterpret_cast<const d2 *>(ptr))));
+    sycl::vec<double, 3> ret;
+    ret.x() = rv_2[0];
+    ret.y() = rv_2[1];
+    ret.z() = rv;
+    return ret;
+  } else if constexpr (std::is_same_v<T, sycl::vec<double, 4>>) {
+    typedef double d2 ATTRIBUTE_EXT_VEC_TYPE(2);
+    d2 rv1 = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+    d2 rv2 = __nvvm_ldg_d2(std::next(reinterpret_cast<const d2 *>(ptr)));
+    sycl::vec<double, 4> ret;
+    ret.x() = rv1[0];
+    ret.y() = rv1[1];
+    ret.z() = rv2[0];
+    ret.w() = rv2[1];
+    return ret;
   }
 #else
   return *ptr;
diff --git a/sycl/test/check_device_code/cuda/ldg.cpp b/sycl/test/check_device_code/cuda/ldg.cpp
index 2cddb4fc8818d..e0ae28ac6a5f3 100644
--- a/sycl/test/check_device_code/cuda/ldg.cpp
+++ b/sycl/test/check_device_code/cuda/ldg.cpp
@@ -1,7 +1,7 @@
 // REQUIRES: cuda
 
-// RUN: %clangxx -Xclang -no-opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o -| FileCheck %s
-// RUN: %clangxx -Xclang -opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE
+// RUN: %clangxx -Xclang -no-opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xclang -fnative-half-type -S -Xclang -emit-llvm %s -o -| FileCheck %s
+// RUN: %clangxx -Xclang -opaque-pointers -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xclang -fnative-half-type -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE
 
 #include <sycl/ext/oneapi/experimental/cuda/builtins.hpp>
 #include <sycl/sycl.hpp>
@@ -15,6 +15,7 @@ int main() {
   sycl::queue q;
 
   auto *in_c = sycl::malloc_device<char>(1, q);
+  auto *in_sc = sycl::malloc_device<signed char>(1, q);
   auto *in_s = sycl::malloc_device<short>(1, q);
   auto *in_i = sycl::malloc_device<int>(1, q);
   auto *in_l = sycl::malloc_device<long>(1, q);
@@ -26,38 +27,64 @@ int main() {
   auto *in_ul = sycl::malloc_device<unsigned long>(1, q);
   auto *in_ull = sycl::malloc_device<unsigned long long>(1, q);
 
-  auto *in_c2 = sycl::malloc_device<char2>(1, q);
-  auto *in_s2 = sycl::malloc_device<short2>(1, q);
-  auto *in_i2 = sycl::malloc_device<int2>(1, q);
-  auto *in_ll2 = sycl::malloc_device<longlong2>(1, q);
+  auto *in_c2 = sycl::malloc_device<sycl::vec<char, 2>>(1, q);
+  auto *in_c3 = sycl::malloc_device<sycl::vec<char, 3>>(1, q);
+  auto *in_sc2 = sycl::malloc_device<sycl::vec<signed char, 2>>(1, q);
+  auto *in_sc3 = sycl::malloc_device<sycl::vec<signed char, 3>>(1, q);
+  auto *in_s2 = sycl::malloc_device<sycl::vec<short, 2>>(1, q);
+  auto *in_s3 = sycl::malloc_device<sycl::vec<short, 3>>(1, q);
+  auto *in_i2 = sycl::malloc_device<sycl::vec<int, 2>>(1, q);
+  auto *in_i3 = sycl::malloc_device<sycl::vec<int, 3>>(1, q);
+  auto *in_l2 = sycl::malloc_device<sycl::vec<long, 2>>(1, q);
+  auto *in_l3 = sycl::malloc_device<sycl::vec<long, 3>>(1, q);
+  auto *in_ll2 = sycl::malloc_device<sycl::vec<long long, 2>>(1, q);
+  auto *in_ll3 = sycl::malloc_device<sycl::vec<long long, 3>>(1, q);
+  auto *in_l4 = sycl::malloc_device<sycl::vec<long, 4>>(1, q);
+  auto *in_ll4 = sycl::malloc_device<sycl::vec<long long, 4>>(1, q);
 
-  auto *in_c4 = sycl::malloc_device<char4>(1, q);
-  auto *in_s4 = sycl::malloc_device<short4>(1, q);
-  auto *in_i4 = sycl::malloc_device<int4>(1, q);
+  auto *in_c4 = sycl::malloc_device<sycl::vec<char, 4>>(1, q);
+  auto *in_sc4 = sycl::malloc_device<sycl::vec<signed char, 4>>(1, q);
+  auto *in_s4 = sycl::malloc_device<sycl::vec<short, 4>>(1, q);
+  auto *in_i4 = sycl::malloc_device<sycl::vec<int, 4>>(1, q);
 
-  auto *in_uc2 = sycl::malloc_device<uchar2>(1, q);
-  auto *in_us2 = sycl::malloc_device<ushort2>(1, q);
-  auto *in_ui2 = sycl::malloc_device<uint2>(1, q);
-  auto *in_ull2 = sycl::malloc_device<ulonglong2>(1, q);
+  auto *in_uc2 = sycl::malloc_device<sycl::vec<unsigned char, 2>>(1, q);
+  auto *in_uc3 = sycl::malloc_device<sycl::vec<unsigned char, 3>>(1, q);
+  auto *in_us2 = sycl::malloc_device<sycl::vec<unsigned short, 2>>(1, q);
+  auto *in_us3 = sycl::malloc_device<sycl::vec<unsigned short, 3>>(1, q);
+  auto *in_ui2 = sycl::malloc_device<sycl::vec<unsigned int, 2>>(1, q);
+  auto *in_ui3 = sycl::malloc_device<sycl::vec<unsigned int, 3>>(1, q);
+  auto *in_ul2 = sycl::malloc_device<sycl::vec<unsigned long, 2>>(1, q);
+  auto *in_ul3 = sycl::malloc_device<sycl::vec<unsigned long, 3>>(1, q);
+  auto *in_ull2 = sycl::malloc_device<sycl::vec<unsigned long long, 2>>(1, q);
+  auto *in_ull3 = sycl::malloc_device<sycl::vec<unsigned long long, 3>>(1, q);
+  auto *in_ul4 = sycl::malloc_device<sycl::vec<unsigned long, 4>>(1, q);
+  auto *in_ull4 = sycl::malloc_device<sycl::vec<unsigned long long, 4>>(1, q);
 
-  auto *in_uc4 = sycl::malloc_device<uchar4>(1, q);
-  auto *in_us4 = sycl::malloc_device<ushort4>(1, q);
-  auto *in_ui4 = sycl::malloc_device<uint4>(1, q);
+  auto *in_uc4 = sycl::malloc_device<sycl::vec<unsigned char, 4>>(1, q);
+  auto *in_us4 = sycl::malloc_device<sycl::vec<unsigned short, 4>>(1, q);
+  auto *in_ui4 = sycl::malloc_device<sycl::vec<unsigned int, 4>>(1, q);
 
+  auto *in_h = sycl::malloc_device<half>(1, q);
   auto *in_f = sycl::malloc_device<float>(1, q);
   auto *in_d = sycl::malloc_device<double>(1, q);
 
-  auto *in_f2 = sycl::malloc_device<float2>(1, q);
-  auto *in_d2 = sycl::malloc_device<double2>(1, q);
-
-  auto *in_f4 = sycl::malloc_device<float4>(1, q);
-
-  auto *out_d = sycl::malloc_device<double>(1, q);
+  auto *in_h2 = sycl::malloc_device<sycl::vec<half, 2>>(1, q);
+  auto *in_h3 = sycl::malloc_device<sycl::vec<half, 3>>(1, q);
+  auto *in_h4 = sycl::malloc_device<sycl::vec<half, 4>>(1, q);
+  auto *in_f2 = sycl::malloc_device<sycl::vec<float, 2>>(1, q);
+  auto *in_f3 = sycl::malloc_device<sycl::vec<float, 3>>(1, q);
+  auto *in_f4 = sycl::malloc_device<sycl::vec<float, 4>>(1, q);
+  auto *in_d2 = sycl::malloc_device<sycl::vec<double, 2>>(1, q);
+  auto *in_d3 = sycl::malloc_device<sycl::vec<double, 3>>(1, q);
+  auto *in_d4 = sycl::malloc_device<sycl::vec<double, 4>>(1, q);
 
   q.wait();
 
   q.submit([=](sycl::handler &h) {
     h.single_task<class check>([=] {
+      //CHECK: tail call half @llvm.nvvm.ldg.global.f.f16.p0f16(half* %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr %{{.*}}, i32 2)
+      auto cached_h = ldg(&in_h[0]);
       //CHECK: tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %{{.*}}, i32 4)
       //CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %{{.*}}, i32 4)
       auto cached_f = ldg(&in_f[0]);
@@ -65,12 +92,40 @@ int main() {
       //CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr %{{.*}}, i32 8)
       auto cached_d = ldg(&in_d[0]);
 
+      //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
+      auto cached_h2 = ldg(&in_h2[0]);
+      //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* %{{.*}}, i32 4)
+      //CHECK: tail call half @llvm.nvvm.ldg.global.f.f16.p0f16(half* nonnull %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr nonnull %{{.*}}, i32 2)
+      auto cached_h3 = ldg(&in_h3[0]);
+      //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* %{{.*}}, i32 4)
+      //CHECK: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0v2f16(<2 x half>* nonnull %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr nonnull %{{.*}}, i32 4)
+      auto cached_h4 = ldg(&in_h4[0]);
       //CHECK: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* %{{.*}}, i32 8)
       //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8)
       auto cached_f2 = ldg(&in_f2[0]);
+      //CHECK: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* %{{.*}}, i32 8)
+      //CHECK: tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nonnull %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr nonnull %{{.*}}, i32 4)
+      auto cached_f3 = ldg(&in_f3[0]);
       //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16)
       //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
       auto cached_d2 = ldg(&in_d2[0]);
+      //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16)
+      //CHECK: tail call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* nonnull %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr nonnull %{{.*}}, i32 8)
+      auto cached_d3 = ldg(&in_d3[0]);
+      //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16)
+      //CHECK: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* nonnull %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr nonnull %{{.*}}, i32 16)
+      auto cached_d4 = ldg(&in_d4[0]);
       //CHECK: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 16)
       //CHECK-OPAQUE: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %{{.*}}, i32 16)
       auto cached_f4 = ldg(&in_f4[0]);
@@ -81,6 +136,9 @@ int main() {
       //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* %{{.*}}, i32 1)
       //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
       auto cached_c = ldg(&in_c[0]);
+      //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* %{{.*}}, i32 1)
+      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
+      auto cached_sc = ldg(&in_sc[0]);
       //CHECK: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* %{{.*}}, i32 2)
       //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2)
       auto cached_s = ldg(&in_s[0]);
@@ -112,31 +170,118 @@ int main() {
       //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2)
       //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
       auto cached_c2 = ldg(&in_c2[0]);
+      //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2)
+      //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* nonnull %{{.*}}, i32 1)
+      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
+      auto cached_c3 = ldg(&in_c3[0]);
+      //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+      auto cached_sc2 = ldg(&in_sc2[0]);
+      //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2)
+      //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* nonnull %{{.*}}, i32 1)
+      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
+      auto cached_sc3 = ldg(&in_sc3[0]);
       //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4)
       //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
       auto cached_s2 = ldg(&in_s2[0]);
+      //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4)
+      //CHECK: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* nonnull %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2)
+      auto cached_s3 = ldg(&in_s3[0]);
       //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8)
       //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
       auto cached_i2 = ldg(&in_i2[0]);
+      //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8)
+      //CHECK: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* nonnull %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4)
+      auto cached_i3 = ldg(&in_i3[0]);
+      //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}})
+      auto cached_l2 = ldg(&in_l2[0]);
+      //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* %{{.*}}, i32 {{8|16}})
+      //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+      auto cached_l3 = ldg(&in_l3[0]);
       //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16)
       //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
       auto cached_ll2 = ldg(&in_ll2[0]);
+      //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16)
+      //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+      auto cached_ll3 = ldg(&in_ll3[0]);
+      //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* %{{.*}}, i32 {{8|16}})
+      //CHECK: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0v2i{{32|64}}(<2 x i{{32|64}}>* nonnull %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}})
+      auto cached_l4 = ldg(&in_l4[0]);
+      //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16)
+      //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* nonnull %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
+      auto cached_ll4 = ldg(&in_ll4[0]);
       //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2)
       //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
       auto cached_uc2 = ldg(&in_uc2[0]);
+      //CHECK: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* %{{.*}}, i32 2)
+      //CHECK: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* nonnull %{{.*}}, i32 1)
+      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
+      auto cached_uc3 = ldg(&in_uc3[0]);
       //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4)
       //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
       auto cached_us2 = ldg(&in_us2[0]);
+      //CHECK: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* %{{.*}}, i32 4)
+      //CHECK: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* nonnull %{{.*}}, i32 2)
+      //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2)
+      auto cached_us3 = ldg(&in_us3[0]);
       //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8)
       //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
       auto cached_ui2 = ldg(&in_ui2[0]);
+      //CHECK: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* %{{.*}}, i32 8)
+      //CHECK: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* nonnull %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4)
+      auto cached_ui3 = ldg(&in_ui3[0]);
+      //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}})
+      auto cached_ul2 = ldg(&in_ul2[0]);
+      //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* %{{.*}}, i32 {{8|16}})
+      //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+      auto cached_ul3 = ldg(&in_ul3[0]);
       //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16)
       //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
       auto cached_ull2 = ldg(&in_ull2[0]);
+      //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16)
+      //CHECK: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* nonnull %{{.*}}, i32 8)
+      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+      auto cached_ull3 = ldg(&in_ull3[0]);
+      //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* %{{.*}}, i32 {{8|16}})
+      //CHECK: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0v2i{{64|32}}(<2 x i{{64|32}}>* nonnull %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}})
+      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}})
+      auto cached_ul4 = ldg(&in_ul4[0]);
+      //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16)
+      //CHECK: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* nonnull %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
+      auto cached_ull4 = ldg(&in_ull4[0]);
 
       //CHECK: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* %{{.*}}, i32 4)
       //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
       auto cached_c4 = ldg(&in_c4[0]);
+      //CHECK: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* %{{.*}}, i32 4)
+      //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
+      auto cached_sc4 = ldg(&in_sc4[0]);
       //CHECK: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* %{{.*}}, i32 8)
       //CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8)
       auto cached_s4 = ldg(&in_s4[0]);
@@ -158,12 +303,20 @@ int main() {
 
   q.wait();
 
+  free(in_h, q);
   free(in_f, q);
   free(in_d, q);
+  free(in_h2, q);
+  free(in_h3, q);
+  free(in_h4, q);
   free(in_f2, q);
+  free(in_f3, q);
   free(in_f4, q);
   free(in_d2, q);
+  free(in_d3, q);
+  free(in_d4, q);
   free(in_c, q);
+  free(in_sc, q);
   free(in_s, q);
   free(in_i, q);
   free(in_l, q);
@@ -174,14 +327,33 @@ int main() {
   free(in_ul, q);
   free(in_ull, q);
   free(in_c2, q);
+  free(in_c3, q);
+  free(in_sc2, q);
+  free(in_sc3, q);
   free(in_s2, q);
+  free(in_s3, q);
   free(in_i2, q);
+  free(in_i3, q);
+  free(in_l2, q);
+  free(in_l3, q);
   free(in_ll2, q);
+  free(in_ll3, q);
+  free(in_l4, q);
+  free(in_ll4, q);
   free(in_uc2, q);
+  free(in_uc3, q);
   free(in_us2, q);
+  free(in_us3, q);
   free(in_ui2, q);
+  free(in_ui3, q);
+  free(in_ul2, q);
+  free(in_ul3, q);
   free(in_ull2, q);
+  free(in_ull3, q);
+  free(in_ul4, q);
+  free(in_ull4, q);
   free(in_c4, q);
+  free(in_sc4, q);
   free(in_s4, q);
   free(in_i4, q);
   free(in_uc4, q);