Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][E2E][Joint Matrix] OOB tests to support more shapes, layouts #16837

Merged
merged 4 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions sycl/test-e2e/Matrix/Inputs/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,10 @@ void matrix_print(unsigned int rows, unsigned int cols, T *mat) {
std::cout << "\n";
}
}

/// Returns the VNNI packing factor for element type T under the given matrix
/// layout: 1 for non-packed layouts, otherwise the number of T elements that
/// fit in a 32-bit dword (4 / sizeof(T)).
template <typename T, layout Layout> constexpr int vnni_factor() {
  if constexpr (Layout != layout::ext_intel_packed) {
    // Non-packed layouts are never VNNI-transformed.
    return 1;
  } else {
    // static_assert takes (condition, message); the previous
    // `cond && "msg"` form silently discarded the message. Restricting the
    // assert to this branch also avoids firing it for non-packed
    // instantiations, which legitimately return 1 above.
    static_assert(sizeof(T) <= 4, "Unsupported type in vnni_factor().");
    return 4 / sizeof(T);
  }
}
159 changes: 104 additions & 55 deletions sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,20 @@
#include <iostream>
#include <sycl/usm.hpp>

constexpr size_t TM = 8;
constexpr size_t TK = 16;
template <typename Tab, size_t K, layout B_layout> class mult;

template <layout B_layout, unsigned int vnniFactor> class mult;

template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
size_t NUM_COLS_C, layout B_layout, unsigned int vnniFactor>
template <typename T1, typename T2, size_t M, size_t N, size_t K, size_t TM,
size_t TN, size_t TK, layout A_layout, layout B_layout>
void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
size_t M = NUM_ROWS_C;
size_t N = NUM_COLS_C;
size_t K = NUM_COLS_A;

assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * vnniFactor);
// Add one iteration for the out of bounds dpas instruction
size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
size_t NDRangeN = N / TN;
size_t sg_size = get_sg_size<mult<B_layout, vnniFactor>>(q);
size_t sg_size = get_sg_size<mult<T2, K, B_layout>>(q);
std::cout << "SG size: " << sg_size << " ";

q.submit([&](handler &cgh) {
cgh.parallel_for<mult<B_layout, vnniFactor>>(
cgh.parallel_for<mult<T2, K, B_layout>>(
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
[=](nd_item<2> spmd_item)
#ifdef SG_SZ
Expand All @@ -45,6 +38,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
auto pC =
address_space_cast<sycl::access::address_space::global_space,
sycl::access::decorated::no>(C);

// The submatrix API has to be accessed by all the workitems in a
// subgroup these functions will be called once by the subgroup no
// code divergence between the workitems
Expand All @@ -54,27 +48,41 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major>
sub_a;

// For B, since current implementation does not support non-packed
// layout, users need to specify the packed_b layout.
joint_matrix<sub_group, bfloat16, use::b, TK, TN, B_layout> sub_b;
joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
// bounds-checked load where width and height are added
joint_matrix<sub_group, T2, use::a, TM, TK, A_layout> sub_a;
joint_matrix<sub_group, T2, use::b, TK, TN, B_layout> sub_b;
joint_matrix<sub_group, T1, use::accumulator, TM, TN> sub_c;

// bounds-checked fill where width and height are added
ext::intel::experimental::matrix::joint_matrix_fill_checked(
YuriPlyakhin marked this conversation as resolved.
Show resolved Hide resolved
sg, sub_c, 1, M, N, sg_startx * TM, sg_starty / sg_size * TN);

for (int k = 0; k < K; k += TK) {
// bounds-checked load where width and height are added
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, sub_a, pA, K, M, K, sg_startx * TM, k);
                // Assume B is already in VNNI format.
// params order: Stride, Height, Width, CoordX, CoordY
if constexpr (A_layout == layout::row_major) {
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, sub_a, pA, K, M, K, sg_startx * TM, k);
} else {
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, sub_a, pA, M, K, M, k, sg_startx * TM);
}

// bounds-checked load where width and height are added
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor,
k / vnniFactor, sg_starty / sg_size * TN * vnniFactor);
// params order: Stride, Height, Width, CoordX, CoordY
if constexpr (B_layout != layout::col_major) {
constexpr unsigned int vnniFactor = vnni_factor<T2, B_layout>();
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, sub_b, pB, N * vnniFactor, K / vnniFactor,
N * vnniFactor, k / vnniFactor,
sg_starty / sg_size * TN * vnniFactor);
} else {
ext::intel::experimental::matrix::joint_matrix_load_checked(
sg, sub_b, pB, K, N, K, sg_starty / sg_size * TN, k);
}

joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
}

// bounds-checked store where width and height are added
ext::intel::experimental::matrix::joint_matrix_store_checked(
sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM,
Expand All @@ -83,42 +91,83 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
}).wait();
}

int main() {
static constexpr size_t MATRIX_M = 1024 + 14;
static constexpr size_t MATRIX_N = 1024;
static constexpr unsigned int vnniFactor = 2;

template <typename Tab, typename Tc, size_t MATRIX_M, size_t MATRIX_N,
size_t MATRIX_K, size_t TM, size_t TN, size_t TK, layout A_layout,
layout B_layout>
void test() {
std::cout << MATRIX_M << "x" << MATRIX_N << "x" << MATRIX_K << ", " << TM
<< "x" << TN << "x" << TK << ": ";
queue q;
bfloat16 *A = malloc_shared<bfloat16>(MATRIX_M * MATRIX_K, q);
bfloat16 *B = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
bfloat16 *vnniB = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
float *C = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
float *D = malloc_shared<float>(MATRIX_M * MATRIX_N, q);

matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5);
matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5);
matrix_fill(MATRIX_M, MATRIX_N, C, (float)1);
matrix_fill(MATRIX_M, MATRIX_N, D, (float)1);

matrix_vnni<bfloat16>(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor);

// reference data
Tab *A = malloc_shared<Tab>(MATRIX_M * MATRIX_K, q);
Tab *B = malloc_shared<Tab>(MATRIX_K * MATRIX_N, q);
Tc *C = malloc_shared<Tc>(MATRIX_M * MATRIX_N, q);
Tc *D = malloc_shared<Tc>(MATRIX_M * MATRIX_N, q);
matrix_rand(MATRIX_M, MATRIX_K, A, (Tab)5);
matrix_rand(MATRIX_K, MATRIX_N, B, (Tab)5);
matrix_fill(MATRIX_M, MATRIX_N, D, (Tc)1);
matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K);
matrix_multiply<float, bfloat16, MATRIX_M, MATRIX_K, MATRIX_K / vnniFactor,
MATRIX_N * vnniFactor, MATRIX_M, MATRIX_N,
layout::ext_intel_packed, vnniFactor>(C, A, vnniB, q);
bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D);

matrix_multiply<float, bfloat16, MATRIX_M, MATRIX_K, MATRIX_K, MATRIX_N,
MATRIX_M, MATRIX_N, layout::row_major, 1>(C, A, B, q);
res = res && matrix_compare(MATRIX_M, MATRIX_N, C, D);

std::cout << (res ? "passed" : "failed") << std::endl;
// test data
if constexpr (A_layout == layout::col_major) {
Tab *colA = malloc_shared<Tab>(MATRIX_K * MATRIX_M, q);
matrix_transpose(MATRIX_M, MATRIX_K, colA, A);
Tab *tmp = A;
A = colA;
free(tmp, q);
}

if constexpr (B_layout == layout::col_major) {
Tab *colB = malloc_shared<Tab>(MATRIX_N * MATRIX_K, q);
matrix_transpose(MATRIX_K, MATRIX_N, colB, B);
Tab *tmp = B;
B = colB;
free(tmp, q);
}

if constexpr (B_layout == layout::ext_intel_packed) {
Tab *vnniB = malloc_shared<Tab>(MATRIX_K * MATRIX_N, q);
matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnni_factor<Tab, B_layout>());
Tab *tmp = B;
B = vnniB;
free(tmp, q);
}

matrix_multiply<Tc, Tab, MATRIX_M, MATRIX_N, MATRIX_K, TM, TN, TK, A_layout,
B_layout>(C, A, B, q);
assert(matrix_compare(MATRIX_M, MATRIX_N, C, D));
std::cout << "passed" << std::endl;

free(A, q);
free(B, q);
free(vnniB, q);
free(C, q);
free(D, q);
}

return !res;
/// Runs the bounds-checked multiply test over all supported element types and
/// tile shapes for the given A/B layouts, first with K aligned to the tile
/// size (1024 + 24) and then with an unaligned K (1024 + 14).
template <layout A_layout, layout B_layout> void test_all() {
  std::cout << "bf16: ";
  test<bfloat16, float, 1024 + 14, 1024, 1024 + 24, 8, 16, 16, A_layout,
       B_layout>();
  std::cout << "half: ";
  test<half, float, 1024 + 14, 1024, 1024 + 24, 8, 16, 16, A_layout,
       B_layout>();
  std::cout << "int8: ";
  test<int8_t, int32_t, 1024 + 14, 1024, 1024 + 24, 8, 16, 32, A_layout,
       B_layout>();

  // unaligned k:
  std::cout << "bf16: ";
  test<bfloat16, float, 1024 + 14, 1024, 1024 + 14, 8, 16, 16, A_layout,
       B_layout>();
  std::cout << "half: ";
  test<half, float, 1024 + 14, 1024, 1024 + 14, 8, 16, 16, A_layout,
       B_layout>();

  // row major A fails, so disabled. CMPLRLLVM-65239
  if constexpr (A_layout != layout::row_major) {
    std::cout << "int8: ";
    test<int8_t, int32_t, 1024 + 14, 1024, 1024 + 14, 8, 16, 32, A_layout,
         B_layout>();
  }
}
11 changes: 7 additions & 4 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@
// XFAIL-TRACKER: GSD-4181

#include "common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;
constexpr size_t MATRIX_K = 1024 + 24;

#include "joint_matrix_out_bounds_impl.hpp"

// Entry point for the SG32 out-of-bounds test: runs every supported
// type/shape combination with a row-major A against both B layouts
// exercised by this file.
int main() {
  // Both operands row-major.
  std::cout << "A row major, B row major:\n";
  test_all<layout::row_major, layout::row_major>();
  // B in the ext_intel_packed (VNNI) layout.
  std::cout << "A row major, B packed:\n";
  test_all<layout::row_major, layout::ext_intel_packed>();
}
11 changes: 7 additions & 4 deletions sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
// RUN: %{run} %t.out

#include "common.hpp"

constexpr size_t TN = 16;
constexpr size_t MATRIX_K = 1024 + 24;

#include "joint_matrix_out_bounds_impl.hpp"

// Entry point: runs every supported type/shape combination with a row-major
// A against both B layouts exercised by this file. (Stray GitHub review-UI
// text that had been pasted into the function body — which would not
// compile — has been removed.)
int main() {
  // Both operands row-major.
  std::cout << "A row major, B row major:\n";
  test_all<layout::row_major, layout::row_major>();
  // B in the ext_intel_packed (VNNI) layout.
  std::cout << "A row major, B packed:\n";
  test_all<layout::row_major, layout::ext_intel_packed>();
}
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==//
//==----joint_matrix_out_bounds_colmajor.cpp - DPC++ joint_matrix---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix
// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943

// UNSUPPORTED: gpu-intel-dg2, cpu

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// RUN: %{build} -o %t32.out -DSG_SZ=32
// RUN: %{run} %t32.out

// XFAIL:gpu
// XFAIL-TRACKER: GSD-4181
// XFAIL-TRACKER: GSD-5768

#include "common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;
static constexpr size_t MATRIX_K = 1024 + 14;

#include "joint_matrix_out_bounds_impl.hpp"

// Entry point for the column-major out-of-bounds test: runs every supported
// type/shape combination with both A and B in column-major layout.
int main() {
  std::cout << "A col major, B col major:\n";
  test_all<layout::col_major, layout::col_major>();
}
20 changes: 0 additions & 20 deletions sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp

This file was deleted.

Loading