diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp index 35cbef3632..ee8cbfff99 100644 --- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp +++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp @@ -6,8 +6,6 @@ // //===----------------------------------------------------------------------===// // REQUIRES: cuda -// Temp xfail: test was merged early. -// XFAIL: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out // RUN: %t.out @@ -65,15 +63,15 @@ void matrix_verify_op(queue q, big_matrix &C, auto sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix sub_c; joint_matrix_fill(sg, sub_a, 3); joint_matrix_fill(sg, sub_b, 1); joint_matrix_fill(sg, sub_c, -80); - auto wi_slice_a = sub_a.get_wi_data(); + auto wi_slice_a = get_wi_data(sg, sub_a); for (int i = 0; i < wi_slice_a.length(); i++) { if constexpr (std::is_same_v) { if (wi_slice_a[i]) { diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp deleted file mode 100644 index 558451a3db..0000000000 --- a/SYCL/Matrix/element_wise_wi_marray.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//==----------- element_wise_wi_marray.cpp - DPC++ joint_matrix------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: cuda -// Temp xfail: test was merged early. -// XFAIL: cuda -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out -// RUN: %t.out - -#include - -using namespace sycl; -using namespace sycl::ext::oneapi; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 32 - -template void verify_wi_marray(queue q) { - int err = 0; - { - buffer err_buf(&err, 1); - q.submit([&](handler &cgh) { - accessor ERR(err_buf, cgh); - - cgh.parallel_for( - nd_range<2>({1, 1 * SG_SZ}, {1, 1 * SG_SZ}), - [ERR](nd_item<2> spmd_item) [[sycl::reqd_sub_group_size(SG_SZ)]] { - auto sg = spmd_item.get_sub_group(); - - joint_matrix sub_a; - joint_matrix sub_a_2; - - joint_matrix_fill(sg, sub_a, -1); - joint_matrix_fill(sg, sub_a_2, -1); - - auto wi_slice_a = sub_a.get_wi_data(); - for (int i = 0; i < wi_slice_a.length(); i++) { - wi_slice_a[i] = fabs(wi_slice_a[i]); - } - sub_a_2.wi_marray = fabs(sub_a_2.wi_marray); - - for (int i = 0; i < sub_a_2.wi_marray.size(); i++) { - if (sub_a_2.wi_marray[i] != wi_slice_a[i]) { - ERR[0] = 1; - } - } - }); // parallel for - }).wait(); - } - assert(err == 0); -} - -int main() { - - queue q; - auto computeCapability = - std::stof(q.get_device().get_info()); - - if (computeCapability >= 8.0) { - verify_wi_marray(q); - } - - return 0; -} diff --git a/SYCL/Matrix/joint_matrix_tensorcores.cpp b/SYCL/Matrix/joint_matrix_tensorcores.cpp index 8a5a1ae82d..2f29bb0667 100644 --- a/SYCL/Matrix/joint_matrix_tensorcores.cpp +++ b/SYCL/Matrix/joint_matrix_tensorcores.cpp @@ -1,7 +1,5 @@ // REQUIRES: cuda -// Temp xfail: test was merged early. -// XFAIL: cuda // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 %s -o %t.out // RUN: %t.out // @@ -14,7 +12,7 @@ #include using namespace sycl; -using namespace sycl::ext::oneapi::experimental; +using namespace sycl::ext::oneapi; using namespace sycl::ext::oneapi::experimental::matrix; constexpr float bf16_eps = 0.00390625; @@ -146,9 +144,11 @@ void test(queue &q) { // column id of current submatrix of BIG C matrix const auto n = item.get_group().get_group_id()[1]; - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix, use::accumulator, M, N> sub_c; + joint_matrix sub_a; + joint_matrix sub_b; + joint_matrix, use::accumulator, + M, N> + sub_c; joint_matrix_load(sg, sub_c, accC.get_pointer() + (m * M) * Big_N + n * N, @@ -165,11 +165,13 @@ void test(queue &q) { // round values to correct precision if using tf32 if constexpr (std::is_same::value) { - auto wi_size = sub_a.wi_marray.size(); - assert(wi_size == sub_b.wi_marray.size()); + auto wi_size = get_wi_data(sg, sub_a).length(); + assert(wi_size == get_wi_data(sg, sub_b).length()); for (auto i = 0; i < wi_size; ++i) { - sub_a.wi_marray[i] = round_to_tf32(sub_a.wi_marray[i]); - sub_b.wi_marray[i] = round_to_tf32(sub_b.wi_marray[i]); + get_wi_data(sg, sub_a)[i] = + round_to_tf32(get_wi_data(sg, sub_a)[i]); + get_wi_data(sg, sub_b)[i] = + round_to_tf32(get_wi_data(sg, sub_b)[i]); } }