diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp
index dca215ae574d2..371f705bae07f 100644
--- a/sycl/test-e2e/Matrix/Inputs/common.hpp
+++ b/sycl/test-e2e/Matrix/Inputs/common.hpp
@@ -234,3 +234,10 @@ void matrix_print(unsigned int rows, unsigned int cols, T *mat) {
     std::cout << "\n";
   }
 }
+
+template <layout Layout, typename T> constexpr int vnni_factor() {
+  if constexpr (Layout != layout::ext_intel_packed)
+    return 1;
+  static_assert(sizeof(T) <= 4 && "Unsupported type in vnni_factor().");
+  return 4 / sizeof(T);
+}
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
index d8c02b2dc36fb..0f855a8a1dccd 100644
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
+++ b/sycl/test-e2e/Matrix/Inputs/joint_matrix_out_bounds_impl.hpp
@@ -9,27 +9,20 @@
 #include
 #include
-constexpr size_t TM = 8;
-constexpr size_t TK = 16;
+template <size_t TM, size_t TN, size_t TK> class mult;
-template class mult;
-
-template
+template <typename T1, typename T2, size_t M, size_t N, size_t K, size_t TM,
+          size_t TN, size_t TK, layout A_layout, layout B_layout>
 void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
-  size_t M = NUM_ROWS_C;
-  size_t N = NUM_COLS_C;
-  size_t K = NUM_COLS_A;
-  assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * vnniFactor);
   // Add one iteration for the out of bounds dpas instruction
   size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
-  size_t NDRangeN = N / TN;
-  size_t sg_size = get_sg_size>(q);
+  size_t NDRangeN = N / TN + (((N % TN) != 0) ? 1 : 0);
+  size_t sg_size = get_sg_size<mult<TM, TN, TK>>(q);
+  std::cout << "SG size: " << sg_size << " ";
 
   q.submit([&](handler &cgh) {
-    cgh.parallel_for>(
+    cgh.parallel_for<mult<TM, TN, TK>>(
         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -45,6 +38,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
           auto pC = address_space_cast(C);
+
           // The submatrix API has to be accessed by all the workitems in a
           // subgroup these functions will be called once by the subgroup no
           // code divergence between the workitems
@@ -54,27 +48,41 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
           sub_group sg = spmd_item.get_sub_group();
-          joint_matrix
-              sub_a;
-
-          // For B, since current implementation does not support non-packed
-          // layout, users need to specify the packed_b layout.
-          joint_matrix sub_b;
-          joint_matrix sub_c;
-          // bounds-checked load where width and height are added
+          joint_matrix<sub_group, T2, use::a, TM, TK, A_layout> sub_a;
+          joint_matrix<sub_group, T2, use::b, TK, TN, B_layout> sub_b;
+          joint_matrix<sub_group, T1, use::accumulator, TM, TN> sub_c;
+
+          // bounds-checked fill where width and height are added
           ext::intel::experimental::matrix::joint_matrix_fill_checked(
               sg, sub_c, 1, M, N, sg_startx * TM, sg_starty / sg_size * TN);
+
           for (int k = 0; k < K; k += TK) {
             // bounds-checked load where width and height are added
-            ext::intel::experimental::matrix::joint_matrix_load_checked(
-                sg, sub_a, pA, K, M, K, sg_startx * TM, k);
-            // Assume we alreay in vnni format.
+            // params order: Stride, Height, Width, CoordX, CoordY
+            if constexpr (A_layout == layout::row_major) {
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_a, pA, K, M, K, sg_startx * TM, k);
+            } else {
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_a, pA, M, K, M, k, sg_startx * TM);
+            }
+
             // bounds-checked load where width and height are added
-            ext::intel::experimental::matrix::joint_matrix_load_checked(
-                sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor,
-                k / vnniFactor, sg_starty / sg_size * TN * vnniFactor);
+            // params order: Stride, Height, Width, CoordX, CoordY
+            if constexpr (B_layout != layout::col_major) {
+              constexpr unsigned int vnniFactor = vnni_factor<B_layout, T2>();
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_b, pB, N * vnniFactor, K / vnniFactor,
+                  N * vnniFactor, k / vnniFactor,
+                  sg_starty / sg_size * TN * vnniFactor);
+            } else {
+              ext::intel::experimental::matrix::joint_matrix_load_checked(
+                  sg, sub_b, pB, K, N, K, sg_starty / sg_size * TN, k);
+            }
+
             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
           }
+
           // bounds-checked store where width and height are added
           ext::intel::experimental::matrix::joint_matrix_store_checked(
               sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM,
@@ -83,42 +91,69 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   }).wait();
 }
 
-int main() {
-  static constexpr size_t MATRIX_M = 1024 + 14;
-  static constexpr size_t MATRIX_N = 1024;
-  static constexpr unsigned int vnniFactor = 2;
-
+template <typename Tab, typename Tc, size_t MATRIX_M, size_t MATRIX_N,
+          size_t MATRIX_K, size_t TM, size_t TN, size_t TK, layout A_layout,
+          layout B_layout>
+void test() {
+  std::cout << MATRIX_M << "x" << MATRIX_N << "x" << MATRIX_K << ", " << TM
+            << "x" << TN << "x" << TK << ": ";
   queue q;
-  bfloat16 *A = malloc_shared<bfloat16>(MATRIX_M * MATRIX_K, q);
-  bfloat16 *B = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
-  bfloat16 *vnniB = malloc_shared<bfloat16>(MATRIX_K * MATRIX_N, q);
-  float *C = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-  float *D = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-
-  matrix_rand(MATRIX_M, MATRIX_K, A, (bfloat16)5);
-  matrix_rand(MATRIX_K, MATRIX_N, B, (bfloat16)5);
-  matrix_fill(MATRIX_M, MATRIX_N, C, (float)1);
-  matrix_fill(MATRIX_M, MATRIX_N, D, (float)1);
-
-  matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnniFactor);
+  // reference data
+  Tab *A = malloc_shared<Tab>(MATRIX_M * MATRIX_K, q);
+  Tab *B = malloc_shared<Tab>(MATRIX_K * MATRIX_N, q);
+  Tc *C = malloc_shared<Tc>(MATRIX_M * MATRIX_N, q);
+  Tc *D = malloc_shared<Tc>(MATRIX_M * MATRIX_N, q);
+  matrix_rand(MATRIX_M, MATRIX_K, A, (Tab)5);
+  matrix_rand(MATRIX_K, MATRIX_N, B, (Tab)5);
+  matrix_fill(MATRIX_M, MATRIX_N, D, (Tc)1);
   matrix_multiply_ref(A, B, D, MATRIX_M, MATRIX_N, MATRIX_K);
-  matrix_multiply(C, A, vnniB, q);
-  bool res = matrix_compare(MATRIX_M, MATRIX_N, C, D);
-
-  matrix_multiply(C, A, B, q);
-  res = res && matrix_compare(MATRIX_M, MATRIX_N, C, D);
-  std::cout << (res ? "passed" : "failed") << std::endl;
+  // test data
+  if constexpr (A_layout == layout::col_major) {
+    Tab *colA = malloc_shared<Tab>(MATRIX_K * MATRIX_M, q);
+    matrix_transpose(MATRIX_M, MATRIX_K, colA, A);
+    Tab *tmp = A;
+    A = colA;
+    free(tmp, q);
+  }
+
+  if constexpr (B_layout == layout::col_major) {
+    Tab *colB = malloc_shared<Tab>(MATRIX_N * MATRIX_K, q);
+    matrix_transpose(MATRIX_K, MATRIX_N, colB, B);
+    Tab *tmp = B;
+    B = colB;
+    free(tmp, q);
+  }
+
+  if constexpr (B_layout == layout::ext_intel_packed) {
+    Tab *vnniB = malloc_shared<Tab>(MATRIX_K * MATRIX_N, q);
+    matrix_vnni(MATRIX_K, MATRIX_N, B, vnniB, vnni_factor<B_layout, Tab>());
+    Tab *tmp = B;
+    B = vnniB;
+    free(tmp, q);
+  }
+
+  matrix_multiply<Tc, Tab, MATRIX_M, MATRIX_N, MATRIX_K, TM, TN, TK, A_layout,
+                  B_layout>(C, A, B, q);
+  assert(matrix_compare(MATRIX_M, MATRIX_N, C, D));
+  std::cout << "passed" << std::endl;
 
   free(A, q);
   free(B, q);
-  free(vnniB, q);
   free(C, q);
   free(D, q);
+}
 
-  return !res;
+template <layout A_layout, layout B_layout> void test_all() {
+  std::cout << "bf16: ";
+  test();
+  std::cout << "half: ";
+  test();
+  std::cout << "int8: ";
+  test();
 }
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
index 732ddc84243cd..e79e329e5a259 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
@@ -9,6 +9,7 @@
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
@@ -17,9 +18,12 @@
 // XFAIL-TRACKER: GSD-4181
 
 #include "common.hpp"
-
 #define SG_SZ 32
-constexpr size_t TN = 16;
-constexpr size_t MATRIX_K = 1024 + 24;
-
 #include "joint_matrix_out_bounds_impl.hpp"
+
+int main() {
+  std::cout << "A row major, B row major:\n";
+  test_all<layout::row_major, layout::row_major>();
+  std::cout << "A row major, B packed:\n";
+  test_all<layout::row_major, layout::ext_intel_packed>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
index c8881c4edfc94..f71d091c6cbd8 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
@@ -8,13 +8,17 @@
 // REQUIRES: aspect-ext_intel_matrix
 
 // UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-constexpr size_t MATRIX_K = 1024 + 24;
-
 #include "joint_matrix_out_bounds_impl.hpp"
+
+int main() {
+  std::cout << "A row major, B row major:\n";
+  test_all<layout::row_major, layout::row_major>();
+  std::cout << "A row major, B packed:\n";
+  test_all<layout::row_major, layout::ext_intel_packed>();
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp
similarity index 57%
rename from sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
rename to sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp
index fa88c72729a40..51c10c44642a5 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_colmajor.cpp
@@ -1,4 +1,4 @@
-//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==//
+//==----joint_matrix_out_bounds_colmajor.cpp - DPC++ joint_matrix---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,20 +6,23 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-ext_intel_matrix
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // UNSUPPORTED: gpu-intel-dg2, cpu
+// UNSUPPORTED-INTENDED: Checked load/stores are not supported by DG2 and CPU HW
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+// RUN: %{build} -o %t32.out -DSG_SZ=32
+// RUN: %{run} %t32.out
+
 // XFAIL:gpu
-// XFAIL-TRACKER: GSD-4181
+// XFAIL-TRACKER: GSD-5768
 
 #include "common.hpp"
-
-#define SG_SZ 32
-constexpr size_t TN = 16;
-static constexpr size_t MATRIX_K = 1024 + 14;
-
 #include "joint_matrix_out_bounds_impl.hpp"
+
+int main() {
+  std::cout << "A col major, B col major:\n";
+  test_all<layout::col_major, layout::col_major>();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
deleted file mode 100644
index e9e6c977b55f7..0000000000000
--- a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix-------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: aspect-ext_intel_matrix
-
-// UNSUPPORTED: gpu-intel-dg2, cpu
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "common.hpp"
-
-constexpr size_t TN = 16;
-static constexpr size_t MATRIX_K = 1024 + 14;
-
-#include "joint_matrix_out_bounds_impl.hpp"
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index f95323d179fb0..8b826d58a8dab 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 375
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 371
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -273,22 +273,18 @@
 // CHECK-NEXT: Matrix/SG32/joint_matrix_down_convert.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_half.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_out_bounds.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_prefetch.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_ss_int8.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_su_int8.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_transposeC.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_unaligned_k.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_us_int8.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_uu_int8.cpp
 // CHECK-NEXT: Matrix/joint_matrix_annotated_ptr.cpp
 // CHECK-NEXT: Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp
 // CHECK-NEXT: Matrix/joint_matrix_bf16_fill_k_cache_prefetch.cpp
 // CHECK-NEXT: Matrix/joint_matrix_down_convert.cpp
-// CHECK-NEXT: Matrix/joint_matrix_out_bounds.cpp
 // CHECK-NEXT: Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
-// CHECK-NEXT: Matrix/joint_matrix_unaligned_k.cpp
 // CHECK-NEXT: NewOffloadDriver/aot-gpu.cpp
 // CHECK-NEXT: NewOffloadDriver/spirv_device_obj_smoke.cpp
 // CHECK-NEXT: NonUniformGroups/ballot_group.cpp