Merged
274 changes: 274 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp
@@ -0,0 +1,274 @@
#pragma once

#include <CL/sycl/detail/defines_elementary.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace ext {
namespace intel {
namespace experimental::matrix {

enum class matrix_type { a, b, accumulator };
Contributor

While we don't have this right now, it will be needed for future support, so this is a good addition to the interface. I would suggest changing the name, though, to matrix_use rather than matrix_type.
Also, please take a look at the query interface in static-query.hpp. It provides a nice way to bypass all these extra arguments, including the sizes (see the example in https://github.com/intel/llvm/blob/sycl/sycl/test/matrix/query.cpp)
You just need to say:
using myparams = tpu_params<tpu::nvidia, int8_t, int8_t, int>;
The matrices can be created as follows:
myparams::joint_matrix_a<sub_group> sub_a(sg);
myparams::joint_matrix_b<sub_group> sub_b(sg);
myparams::joint_matrix_c<sub_group> sub_c(sg);

As you can see, the sizes are constructed underneath; the matrix_use is specified in the type alias.

Contributor Author @JackAKirk (Oct 5, 2021)

Thanks. I think the query that omits the matrix size parameters will be useful for the tensorcore case in two ways: as a user query reporting which matrix sizes are available for a given matrix_type (using the definition of matrix_type in static-query.hpp, which essentially corresponds to the matrix::precision described in the tensorcore matrix proposal), and potentially as a way to reduce the number of parameters needed in the group functions where a single matrix_type corresponds to a single matrix size. There are only a few cases where the latter holds for CUDA; in the majority of cases all template parameters are needed to uniquely specify the correct joint_matrix (see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-shape). Since the tensorcore case does not support a continuous range of integer matrix sizes, variables such as max_msize, max_nsize, max_ksize won't be appropriate for CUDA, but we could, for example, make an alternative CUDA implementation that reports to the user the set of available matrix sizes (most commonly two or three per matrix_type) for each matrix_type.

Contributor

Since the tensorcore case does not support a continuous range of integers for the matrix sizes, the variables such as max_msize, max_nsize, max_ksize won't be appropriate for the cuda case

The max_msize/nsize/ksize variables are only appropriate for AMX.
The DPAS GPU implementation also supports a discrete range of values; those are the msize/nsize/ksize members of the 'combination' type. Please refer to https://github.com/intel/llvm/blob/sycl/sycl/include/sycl/ext/oneapi/matrix/static-query.hpp#L297 for how we filled out the combinations for DPAS.

Contributor Author

I see, thanks.


enum class matrix_layout { row_major, col_major, packed };

template <typename Group, typename T, matrix_type MT,
size_t Rows = sycl::dynamic_extent,
size_t Cols = sycl::dynamic_extent,
matrix_layout Layout = matrix_layout::row_major, typename Cond = void>
struct joint_matrix {
joint_matrix(Group g) {}
};

// The enable_if_t usage in this file disables the matrix_layout::packed
// case, which is not supported by the NVIDIA CUDA backend.
template <matrix_layout Layout>
struct joint_matrix<
sycl::sub_group, double, matrix_type::a, 8, 4, Layout,
typename std::enable_if_t<Layout == matrix_layout::row_major ||
Layout == matrix_layout::col_major>> {
double data[1];
};

template <matrix_layout Layout>
struct joint_matrix<
sycl::sub_group, double, matrix_type::b, 4, 8, Layout,
typename std::enable_if_t<(Layout == matrix_layout::row_major ||
Layout == matrix_layout::col_major)>> {
double data[1];
};

template <matrix_layout Layout>
struct joint_matrix<
sycl::sub_group, double, matrix_type::accumulator, 8, 8, Layout,
typename std::enable_if_t<Layout == matrix_layout::row_major ||
Layout == matrix_layout::col_major>> {
double data[2];
};

} // namespace experimental::matrix

namespace detail {
using namespace experimental;

template <typename Group, typename T, matrix::matrix_type MT, size_t NumRows,
size_t NumCols, matrix::matrix_layout Layout,
access::address_space Space, typename Cond = void>
struct joint_matrix_load_impl {
void load(matrix::joint_matrix<Group, T, MT, NumRows, NumCols, Layout> &res,
multi_ptr<T, Space> src, size_t stride);
};

template <matrix::matrix_layout Layout> constexpr int get_layout_id();

template <> constexpr int get_layout_id<matrix::matrix_layout::row_major>() {
return 0;
}

template <> constexpr int get_layout_id<matrix::matrix_layout::col_major>() {
return 1;
}

template <matrix::matrix_layout Layout, access::address_space Space>
struct joint_matrix_load_impl<
sycl::sub_group, double, matrix::matrix_type::a, 8, 4, Layout, Space,
typename std::enable_if_t<Layout == matrix::matrix_layout::row_major ||
Layout == matrix::matrix_layout::col_major>> {
void load(matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::a, 8, 4, Layout> &res,
multi_ptr<double, Space> src, size_t stride) {

#ifdef __NVPTX__
#ifdef __SYCL_DEVICE_ONLY__
__dmma_m8n8k4_ld_a(res.data, src.get(), stride, get_layout_id<Layout>());
#endif
#endif
}
};

template <matrix::matrix_layout Layout, access::address_space Space>
struct joint_matrix_load_impl<
sycl::sub_group, double, matrix::matrix_type::b, 4, 8, Layout, Space,
typename std::enable_if_t<Layout == matrix::matrix_layout::row_major ||
Layout == matrix::matrix_layout::col_major>> {
void load(matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::b, 4, 8, Layout> &res,
multi_ptr<double, Space> src, size_t stride) {
#ifdef __NVPTX__
#ifdef __SYCL_DEVICE_ONLY__
__dmma_m8n8k4_ld_b(res.data, src.get(), stride, get_layout_id<Layout>());
#endif
#endif
}
};

template <matrix::matrix_layout Layout, access::address_space Space>
struct joint_matrix_load_impl<
sycl::sub_group, double, matrix::matrix_type::accumulator, 8, 8, Layout,
Space,
typename std::enable_if_t<Layout == matrix::matrix_layout::row_major ||
Layout == matrix::matrix_layout::col_major>> {
void load(
matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::accumulator, 8, 8, Layout> &res,
multi_ptr<double, Space> src, size_t stride) {

#ifdef __NVPTX__
#ifdef __SYCL_DEVICE_ONLY__
__dmma_m8n8k4_ld_c(res.data, src.get(), stride, get_layout_id<Layout>());
#endif
#endif
}
};

template <typename Group, typename T, size_t NumRows, size_t NumCols,
matrix::matrix_layout Layout, access::address_space Space,
typename Cond = void>
struct joint_matrix_store_impl {
void store(matrix::joint_matrix<Group, T, matrix::matrix_type::accumulator,
NumRows, NumCols, Layout> &src,
multi_ptr<T, Space> dst, size_t stride);
};

template <matrix::matrix_layout Layout, access::address_space Space>
struct joint_matrix_store_impl<
sycl::sub_group, double, 8, 8, Layout, Space,
typename std::enable_if_t<Layout == matrix::matrix_layout::row_major ||
Layout == matrix::matrix_layout::col_major>> {
void store(
matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::accumulator, 8, 8, Layout> &src,
multi_ptr<double, Space> dst, size_t stride) {

#ifdef __NVPTX__
#ifdef __SYCL_DEVICE_ONLY__
__dmma_m8n8k4_st_c_f64(dst.get(), src.data, stride,
get_layout_id<Layout>());
#endif
#endif
}
};

template <typename Group, typename T1, typename T2, std::size_t M,
std::size_t K, std::size_t N, matrix::matrix_layout LayoutA,
matrix::matrix_layout LayoutB, matrix::matrix_layout LayoutC,
typename Cond = void>
struct joint_matrix_mad_impl {
matrix::joint_matrix<Group, T2, matrix::matrix_type::accumulator, M, N,
LayoutC>
mad(Group sg,
matrix::joint_matrix<Group, T1, matrix::matrix_type::a, M, K, LayoutA> A,
matrix::joint_matrix<Group, T1, matrix::matrix_type::b, K, N, LayoutB> B,
matrix::joint_matrix<Group, T2, matrix::matrix_type::accumulator, M, N,
LayoutC>
C);
};

template <matrix::matrix_layout LayoutA, matrix::matrix_layout LayoutB>
constexpr int get_layout_pair_id();

template <>
constexpr int get_layout_pair_id<matrix::matrix_layout::row_major,
matrix::matrix_layout::row_major>() {
return 0;
}

template <>
constexpr int get_layout_pair_id<matrix::matrix_layout::row_major,
matrix::matrix_layout::col_major>() {
return 1;
}

template <>
constexpr int get_layout_pair_id<matrix::matrix_layout::col_major,
matrix::matrix_layout::row_major>() {
return 2;
}

template <>
constexpr int get_layout_pair_id<matrix::matrix_layout::col_major,
matrix::matrix_layout::col_major>() {
return 3;
}

template <matrix::matrix_layout LayoutA, matrix::matrix_layout LayoutB,
matrix::matrix_layout LayoutC>
struct joint_matrix_mad_impl<
sycl::sub_group, double, double, 8, 4, 8, LayoutA, LayoutB, LayoutC,
typename std::enable_if_t<(LayoutA == matrix::matrix_layout::row_major ||
LayoutA == matrix::matrix_layout::col_major) &&
(LayoutB == matrix::matrix_layout::row_major ||
LayoutB == matrix::matrix_layout::col_major) &&
(LayoutC == matrix::matrix_layout::row_major ||
LayoutC == matrix::matrix_layout::col_major)>> {
matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::accumulator, 8, 8, LayoutC>
mad(sycl::sub_group sg,
matrix::joint_matrix<sycl::sub_group, double, matrix::matrix_type::a, 8,
4, LayoutA>
A,
matrix::joint_matrix<sycl::sub_group, double, matrix::matrix_type::b, 4,
8, LayoutB>
B,
matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::accumulator, 8, 8, LayoutC>
C) {
matrix::joint_matrix<sycl::sub_group, double,
matrix::matrix_type::accumulator, 8, 8, LayoutC>
D;

#ifdef __NVPTX__
#ifdef __SYCL_DEVICE_ONLY__
__dmma_m8n8k4_mma_f64(D.data, A.data, B.data, C.data,
get_layout_pair_id<LayoutA, LayoutB>(), 0);
#endif
#endif

return D;
}
};

} // namespace detail

namespace experimental::matrix {

template <typename Group, typename T, matrix_type MT, size_t NumRows,
size_t NumCols, matrix_layout Layout, access::address_space Space>
void joint_matrix_load(
Group sg, joint_matrix<Group, T, MT, NumRows, NumCols, Layout> &res,
multi_ptr<T, Space> src, size_t stride) {
detail::joint_matrix_load_impl<Group, T, MT, NumRows, NumCols, Layout,
Space>{}
.load(res, src, stride);
}

template <typename Group, typename T, size_t NumRows, size_t NumCols,
matrix_layout Layout, access::address_space Space>
void joint_matrix_store(Group sg,
joint_matrix<Group, T, matrix_type::accumulator,
NumRows, NumCols, Layout> &src,
multi_ptr<T, Space> dst, size_t stride) {
detail::joint_matrix_store_impl<Group, T, NumRows, NumCols, Layout, Space>{}
.store(src, dst, stride);
}

template <typename Group, typename T1, typename T2, std::size_t M,
std::size_t K, std::size_t N, matrix_layout LayoutA,
matrix_layout LayoutB, matrix_layout LayoutC>
joint_matrix<Group, T2, matrix_type::accumulator, M, N, LayoutC>
joint_matrix_mad(
Group sg, joint_matrix<Group, T1, matrix_type::a, M, K, LayoutA> A,
joint_matrix<Group, T1, matrix_type::b, K, N, LayoutB> B,
joint_matrix<Group, T2, matrix_type::accumulator, M, N, LayoutC> C) {
return detail::joint_matrix_mad_impl<Group, T1, T2, M, K, N, LayoutA, LayoutB,
LayoutC>{}
.mad(sg, A, B, C);
}

} // namespace experimental::matrix
} // namespace intel
} // namespace ext
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
3 changes: 3 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix.hpp
@@ -25,3 +25,6 @@
#include <sycl/ext/oneapi/matrix/matrix-jit.hpp>
#include <sycl/ext/oneapi/matrix/static-query.hpp>
#endif
#if (SYCL_EXT_ONEAPI_MATRIX == 3)
Contributor

This implementation can also benefit from the static query we have. Besides giving the user information about what the implementation supports, the query can also construct the matrices and make the sizes optional for the user.

We should probably add this to matrix-jit.hpp and fork to the AOT tensorcore implementation based on some option (AOT for tensorcore).
I am asking because we should have one place that holds the interface, to make maintaining the code easy; also, since this interface is experimental, we expect it to change (like the use argument you introduce). Keeping the interface in one place means we only have to modify it in one place.

Contributor Author

Do you think that there should be a single header for all of the definitions of joint_matrix, joint_matrix_load, joint_matrix_store, joint_matrix_mad, and then backend dependent specializations of these functions can be in separate files?

Contributor

Yes, if you can use the same things as in matrix/matrix-jit.hpp, like matrix_layout, and not redefine them, that would be better.
For the things that differ because of the "use" argument, like the definition of the joint_matrix type and joint_matrix_load/store/mad, can you add the use-definitions in matrix-jit.hpp (under the new test macro = 3)?

As you know, we are planning on adding the new "use" argument for AMX and DPAS as well. Once we do that, there will be one definition of joint_matrix type/joint_matrix_load/store/mad.

If you make this change now, later, there will be one place for us to change (remove the old joint_matrix,load,store,mad that do not have "use" argument). And we won't need to touch the tensorcores specific specifications that will be in a different file.

Also, when this convergence happens, there will be no need for the feature test macro. Since this is an experimental interface, we don't need to keep track of "old" versions of the interface. We will remove AOT AMX (SYCL_EXT_ONEAPI_MATRIX=1) and only keep matrix-jit.hpp, which enables DPAS, AMX, and tensorcores.

Contributor Author @JackAKirk (Oct 19, 2021)

matrix_layout has a definition identical to the one in matrix-jit.hpp.

For the things that are different like the definition of joint_matrix type, joint_matrix_load/store/mad because of "use" argument, can you add the use-definitions in matrix-jit.hpp (under the new test macro = 3)

As you know, we are planning on adding the new "use" argument for AMX and DPAS as well. Once we do that, there will be one definition of joint_matrix type/joint_matrix_load/store/mad.

I'm not sure what you are asking me to do here: if I add the definitions of the joint_matrix_* functions used in matrix-tensorcore.hpp into matrix-jit.hpp, they will be redeclarations of the Intel-specific functions already defined in matrix-jit.hpp that do not use the matrix_use template parameter.

Contributor Author

Hi @dkhaldi , We would like to get this merged. Could you clarify what you would like me to change? Thanks.

Contributor

Sorry for the late reply. I was thinking you could have these defined under the new test macro = 3 in the same file so they don't get redefined.
However, I think it will be best if we merge these as separate files. Once we add the use argument, we can reiterate on this and merge both files. What do you think?

Contributor Author

OK sure, I think that keeping them separate is a good idea for now.

#include <sycl/ext/oneapi/matrix/matrix-tensorcore.hpp>
#endif
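The guard above means a user opts into the tensorcore path by defining the feature-test macro before the umbrella header is included. A minimal sketch of such a translation unit follows; the compiler flag shown in the comment is an assumption about the CUDA backend invocation, not something specified in this PR.

```cpp
// Select the tensorcore implementation (SYCL_EXT_ONEAPI_MATRIX == 3)
// before including the umbrella header, typically via the command line
// when targeting the CUDA backend, e.g.:
//   -DSYCL_EXT_ONEAPI_MATRIX=3 -fsycl-targets=nvptx64-nvidia-cuda
#define SYCL_EXT_ONEAPI_MATRIX 3
#include <sycl/ext/oneapi/matrix/matrix.hpp>
```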