diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp
index dca215ae574d2..371f705bae07f 100644
--- a/sycl/test-e2e/Matrix/Inputs/common.hpp
+++ b/sycl/test-e2e/Matrix/Inputs/common.hpp
@@ -234,3 +234,10 @@ void matrix_print(unsigned int rows, unsigned int cols, T *mat) {
     std::cout << "\n";
   }
 }
+
+template <layout Layout, typename T> constexpr int vnni_factor() {
+  if constexpr (Layout != layout::ext_intel_packed)
+    return 1;
+  static_assert(sizeof(T) <= 4 && "Unsupported type in vnni_factor().");
+  return 4 / sizeof(T);
+}
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
index d8c02b2dc36fb..0f855a8a1dccd 100644
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
+++ b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
@@ -9,27 +9,20 @@
 #include
 #include
-constexpr size_t TM = 8;
-constexpr size_t TK = 16;
+template <size_t TM, size_t TN, size_t TK> class mult;
-template class mult;
-
-template
+template <typename T1, typename T2, size_t M, size_t N, size_t K, size_t TM,
+          size_t TN, size_t TK, layout A_layout, layout B_layout>
 void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
-  size_t M = NUM_ROWS_C;
-  size_t N = NUM_COLS_C;
-  size_t K = NUM_COLS_A;
-  assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * vnniFactor);
   // Add one iteration for the out of bounds dpas instruction
   size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
-  size_t NDRangeN = N / TN;
-  size_t sg_size = get_sg_size>(q);
+  size_t NDRangeN = N / TN + (((N % TN) != 0) ? 1 : 0);
+  size_t sg_size = get_sg_size<mult<TM, TN, TK>>(q);
+  std::cout << "SG size: " << sg_size << " ";
 
   q.submit([&](handler &cgh) {
-    cgh.parallel_for>(
+    cgh.parallel_for<mult<TM, TN, TK>>(
         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -45,6 +38,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
           auto pC = address_space_cast(C);
+
           // The submatrix API has to be accessed by all the workitems in a
           // subgroup these functions will be called once by the subgroup no
           // code divergence between the workitems
@@ -54,27 +48,41 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
           sub_group sg = spmd_item.get_sub_group();
-          joint_matrix
-              sub_a;
-
-          // For B, since current implementation does not support non-packed
-          // layout, users need to specify the packed_b layout.
-          joint_matrix sub_b;
-          joint_matrix sub_c;
-          // bounds-checked load where width and height are added
+          joint_matrix<sub_group, T2, use::a, TM, TK, A_layout> sub_a;
+          joint_matrix<sub_group, T2, use::b, TK, TN, B_layout> sub_b;
+          joint_matrix<sub_group, T1, use::accumulator, TM, TN> sub_c;
+
+          // bounds-checked fill where width and height are added
           ext::intel::experimental::matrix::joint_matrix_fill_checked(
               sg, sub_c, 1, M, N, sg_startx * TM, sg_starty / sg_size * TN);
+
           for (int k = 0; k < K; k += TK) {
             // bounds-checked load where width and height are added
-            ext::intel::experimental::matrix::joint_matrix_load_checked(
-                sg, sub_a, pA, K, M, K, sg_startx * TM, k);
-            // Assume we alreay in vnni format.
+            // params order: Stride, Height, Width, CoordX, CoordY
+            if constexpr (A_layout == layout::row_major) {
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_a, pA, K, M, K, sg_startx * TM, k);
+            } else {
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_a, pA, M, K, M, k, sg_startx * TM);
+            }
+
             // bounds-checked load where width and height are added
-            ext::intel::experimental::matrix::joint_matrix_load_checked(
-                sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor,
-                k / vnniFactor, sg_starty / sg_size * TN * vnniFactor);
+            // params order: Stride, Height, Width, CoordX, CoordY
+            if constexpr (B_layout != layout::col_major) {
+              constexpr unsigned int vnniFactor = vnni_factor<B_layout, T2>();
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_b, pB, N * vnniFactor, K / vnniFactor,
+                  N * vnniFactor, k / vnniFactor,
+                  sg_starty / sg_size * TN * vnniFactor);
+            } else {
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_b, pB, K, N, K, sg_starty / sg_size * TN, k);
+            }
+
             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
           }
+
           // bounds-checked store where width and height are added
           ext::intel::experimental::matrix::joint_matrix_store_checked(
               sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM,
@@ -83,42 +91,69 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   }).wait();
 }
 
-int main() {
-  static constexpr size_t MATRIX_M = 1024 + 14;
-  static constexpr size_t MATRIX_N = 1024;
-  static constexpr unsigned int vnniFactor = 2;
-
+template <typename Tab, typename Tc, size_t MATRIX_M, size_t MATRIX_N,
+          size_t MATRIX_K, size_t TM, size_t TN, size_t TK, layout A_layout,
+          layout B_layout>
+void test() {
+  std::cout << MATRIX_M << "x" << MATRIX_N << "x" << MATRIX_K << ", " << TM
+            << "x" << TN << "x" << TK << ": ";
   queue q;
-  bfloat16 *A = malloc_shared<bfloat16>(MATRIX_M * MATRIX_K, q);
-  bfloat16 *B = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
-  bfloat16 *vnniB = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
-  float *C = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-  float *D = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-
-  matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5);
-  matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5);
-  matrix_fill(MATRIX_M, MATRIX_N, C, (float)1);
-  matrix_fill(MATRIX_M, MATRIX_N, D, (float)1);
-
-  matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor);
+  // reference data
+  Tab *A = malloc_shared<Tab>(MATRIX_M * MATRIX_K, q);
+  Tab *B = malloc_shared<Tab>(MATRIX_K * MATRIX_N, q);
+  Tc *C = malloc_shared<Tc>(MATRIX_M * MATRIX_N, q);
+  Tc *D = malloc_shared<Tc>(MATRIX_M * MATRIX_N, q);
+  matrix_rand(MATRIX_M, MATRIX_K, A, (Tab)5);
+  matrix_rand(MATRIX_K, MATRIX_N, B, (Tab)5);
+  matrix_fill(MATRIX_M, MATRIX_N, D, (Tc)1);
   matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K);
-  matrix_multiply(C, A, vnniB, q);
-  bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D);
-
-  matrix_multiply(C, A, B, q);
-  res = res && matrix_compare(MATRIX_M, MATRIX_N, C, D);
-  std::cout << (res ? "passed" : "failed") << std::endl;
+  // test data
+  if constexpr (A_layout == layout::col_major) {
+    Tab *colA = malloc_shared<Tab>(MATRIX_K * MATRIX_M, q);
+    matrix_transpose(MATRIX_M, MATRIX_K, colA, A);
+    Tab *tmp = A;
+    A = colA;
+    free(tmp, q);
+  }
+
+  if constexpr (B_layout == layout::col_major) {
+    Tab *colB = malloc_shared<Tab>(MATRIX_N * MATRIX_K, q);
+    matrix_transpose(MATRIX_K, MATRIX_N, colB, B);
+    Tab *tmp = B;
+    B = colB;
+    free(tmp, q);
+  }
+
+  if constexpr (B_layout == layout::ext_intel_packed) {
+    Tab *vnniB = malloc_shared<Tab>(MATRIX_K * MATRIX_N, q);
+    matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnni_factor<B_layout, Tab>());
+    Tab *tmp = B;
+    B = vnniB;
+    free(tmp, q);
+  }
+
+  matrix_multiply<Tc, Tab, MATRIX_M, MATRIX_N, MATRIX_K, TM, TN, TK, A_layout,
+                  B_layout>(C, A, B, q);
+  assert(matrix_compare(MATRIX_M, MATRIX_N, C, D));
+  std::cout << "passed" << std::endl;
 
   free(A, q);
   free(B, q);
-  free(vnniB, q);
   free(C, q);
   free(D, q);
+}
 
-  return !res;
+template <layout A_layout, layout B_layout> void test_all() {
+  std::cout << "bf16: ";
+  test();
+  std::cout << "half: ";
+  test();
+  std::cout << "int8: ";
+  test();
 }
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
index 732ddc84243cd..e79e329e5a259 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
@@ -9,6 +9,7 @@
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
@@ -17,9 +18,12 @@
 // XFAIL-TRACKER: GSD-4181
 
 #include "common.hpp"
-
 #define SG_SZ 32
-constexpr size_t TN = 16;
-constexpr size_t MATRIX_K = 1024 + 24;
-
 #include "joint_matrix_out_bounds_impl.hpp"
+
+int main() {
+  std::cout << "A row major, B row major:\n";
+  test_all<layout::row_major, layout::row_major>();
+  std::cout << "A row major, B packed:\n";
+  test_all<layout::row_major, layout::ext_intel_packed>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
index c8881c4edfc94..f71d091c6cbd8 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
@@ -8,13 +8,17 @@
 // REQUIRES: aspect-ext_intel_matrix
 
 // UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-constexpr size_t MATRIX_K = 1024 + 24;
-
 #include "joint_matrix_out_bounds_impl.hpp"
+
+int main() {
+  std::cout << "A row major, B row major:\n";
+  test_all<layout::row_major, layout::row_major>();
+  std::cout << "A row major, B packed:\n";
+  test_all<layout::row_major, layout::ext_intel_packed>();
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp
similarity index 57%
rename from sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
rename to sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp
index fa88c72729a40..51c10c44642a5 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp
@@ -1,4 +1,4 @@
-//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==//
+//==----joint_matrix_out_bounds_colmajor.cpp - DPC++ joint_matrix---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,20 +6,23 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-ext_intel_matrix
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+// RUN: %{build} -o %t32.out -DSG_SZ=32
+// RUN: %{run} %t32.out
+
 // XFAIL:gpu
-// XFAIL-TRACKER: GSD-4181
+// XFAIL-TRACKER: GSD-5768
 
 #include "common.hpp"
-
-#define SG_SZ 32
-constexpr size_t TN = 16;
-static constexpr size_t MATRIX_K = 1024 + 14;
-
 #include "joint_matrix_out_bounds_impl.hpp"
+
+int main() {
+  std::cout << "A col major, B col major:\n";
+  test_all<layout::col_major, layout::col_major>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
deleted file mode 100644
index e9e6c977b55f7..0000000000000
--- a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: aspect-ext_intel_matrix
-
-// UNSUPPORTED: gpu-intel-dg2, cpu
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "common.hpp"
-
-constexpr size_t TN = 16;
-static constexpr size_t MATRIX_K = 1024 + 14;
-
-#include "joint_matrix_out_bounds_impl.hpp"
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index f95323d179fb0..8b826d58a8dab 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 375
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 371
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -273,22 +273,18 @@
 // CHECK-NEXT: Matrix/SG32/joint_matrix_down_convert.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_half.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_out_bounds.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_prefetch.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_ss_int8.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_su_int8.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_transposeC.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_unaligned_k.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_us_int8.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_uu_int8.cpp
 // CHECK-NEXT: Matrix/joint_matrix_annotated_ptr.cpp
 // CHECK-NEXT: Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp
 // CHECK-NEXT: Matrix/joint_matrix_bf16_fill_k_cache_prefetch.cpp
 // CHECK-NEXT: Matrix/joint_matrix_down_convert.cpp
-// CHECK-NEXT: Matrix/joint_matrix_out_bounds.cpp
 // CHECK-NEXT: Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
-// CHECK-NEXT: Matrix/joint_matrix_unaligned_k.cpp
 // CHECK-NEXT: NewOffloadDriver/aot-gpu.cpp
 // CHECK-NEXT: NewOffloadDriver/spirv_device_obj_smoke.cpp
 // CHECK-NEXT: NonUniformGroups/ballot_group.cpp