diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index 35cbef3632..ee8cbfff99 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: cuda
-// Temp xfail: test was merged early.
-// XFAIL: cuda
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
 // RUN: %t.out
 
@@ -65,15 +63,15 @@ void matrix_verify_op(queue q, big_matrix<T2, M * nWGperDim, N * nWGperDim> &C,
 
              auto sg = spmd_item.get_sub_group();
 
-             joint_matrix<T, use::a, M, K, layout::row_major> sub_a;
-             joint_matrix<T, use::b, K, N, layout::row_major> sub_b;
-             joint_matrix<T2, use::accumulator, M, N> sub_c;
+             joint_matrix<sub_group, T, use::a, M, K, layout::row_major> sub_a;
+             joint_matrix<sub_group, T, use::b, K, N, layout::row_major> sub_b;
+             joint_matrix<sub_group, T2, use::accumulator, M, N> sub_c;
 
              joint_matrix_fill(sg, sub_a, 3);
              joint_matrix_fill(sg, sub_b, 1);
              joint_matrix_fill(sg, sub_c, -80);
 
-             auto wi_slice_a = sub_a.get_wi_data();
+             auto wi_slice_a = get_wi_data(sg, sub_a);
              for (int i = 0; i < wi_slice_a.length(); i++) {
                if constexpr (std::is_same_v<Operation, Logical>) {
                  if (wi_slice_a[i]) {
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
deleted file mode 100644
index 558451a3db..0000000000
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-//==----------- element_wise_wi_marray.cpp  - DPC++ joint_matrix------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: cuda
-// Temp xfail: test was merged early.
-// XFAIL: cuda
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
-// RUN: %t.out
-
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 32
-
-template <typename T, size_t M, size_t K> void verify_wi_marray(queue q) {
-  int err = 0;
-  {
-    buffer<int> err_buf(&err, 1);
-    q.submit([&](handler &cgh) {
-       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
-
-       cgh.parallel_for<class marray_kernel>(
-           nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}),
-           [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] {
-             auto sg = spmd_item.get_sub_group();
-
-             joint_matrix<T, use::a, M, K, layout::row_major> sub_a;
-             joint_matrix<T, use::a, M, K, layout::row_major> sub_a_2;
-
-             joint_matrix_fill(sg, sub_a, -1);
-             joint_matrix_fill(sg, sub_a_2, -1);
-
-             auto wi_slice_a = sub_a.get_wi_data();
-             for (int i = 0; i < wi_slice_a.length(); i++) {
-               wi_slice_a[i] = fabs(wi_slice_a[i]);
-             }
-             sub_a_2.wi_marray = fabs(sub_a_2.wi_marray);
-
-             for (int i = 0; i < sub_a_2.wi_marray.size(); i++) {
-               if (sub_a_2.wi_marray[i] != wi_slice_a[i]) {
-                 ERR[0] = 1;
-               }
-             }
-           }); // parallel for
-     }).wait();
-  }
-  assert(err == 0);
-}
-
-int main() {
-
-  queue q;
-  auto computeCapability =
-      std::stof(q.get_device().get_info<sycl::info::device::backend_version>());
-
-  if (computeCapability >= 8.0) {
-    verify_wi_marray<bfloat16, 16, 16>(q);
-  }
-
-  return 0;
-}
diff --git a/SYCL/Matrix/joint_matrix_tensorcores.cpp b/SYCL/Matrix/joint_matrix_tensorcores.cpp
index 8a5a1ae82d..2f29bb0667 100644
--- a/SYCL/Matrix/joint_matrix_tensorcores.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcores.cpp
@@ -1,7 +1,5 @@
 
 // REQUIRES: cuda
-// Temp xfail: test was merged early.
-// XFAIL: cuda
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out
 // RUN: %t.out
 //
@@ -14,7 +12,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 using namespace sycl::ext::oneapi::experimental::matrix;
 constexpr float bf16_eps = 0.00390625;
 
@@ -146,9 +144,11 @@ void test(queue &q) {
             // column id of current submatrix of BIG C matrix
             const auto n = item.get_group().get_group_id()[1];
 
-            joint_matrix<T3, use::a, M, K, layout::row_major> sub_a;
-            joint_matrix<T3, use::b, K, N, layout::row_major> sub_b;
-            joint_matrix<std::remove_const_t<T2>, use::accumulator, M, N> sub_c;
+            joint_matrix<sub_group, T3, use::a, M, K, layout::row_major> sub_a;
+            joint_matrix<sub_group, T3, use::b, K, N, layout::row_major> sub_b;
+            joint_matrix<sub_group, std::remove_const_t<T2>, use::accumulator,
+                         M, N>
+                sub_c;
 
             joint_matrix_load(sg, sub_c,
                               accC.get_pointer() + (m * M) * Big_N + n * N,
@@ -165,11 +165,13 @@ void test(queue &q) {
 
               // round values to correct precision if using tf32
               if constexpr (std::is_same<T3, precision::tf32>::value) {
-                auto wi_size = sub_a.wi_marray.size();
-                assert(wi_size == sub_b.wi_marray.size());
+                auto wi_size = get_wi_data(sg, sub_a).length();
+                assert(wi_size == get_wi_data(sg, sub_b).length());
                 for (auto i = 0; i < wi_size; ++i) {
-                  sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]);
-                  sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]);
+                  get_wi_data(sg, sub_a)[i] =
+                      round_to_tf32(get_wi_data(sg, sub_a)[i]);
+                  get_wi_data(sg, sub_b)[i] =
+                      round_to_tf32(get_wi_data(sg, sub_b)[i]);
                 }
               }