From 51906febfc6cef6a2895de72287206e9b0c8119e Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee
Date: Sun, 31 Mar 2024 23:54:06 -0700
Subject: [PATCH] [GPU] Fix gemm_tiled_opt kernel bug for tile_n_size 32
 (#23776)

### Details:
- Fixed a crash and an accuracy issue for tile_n_size 32 with a transposed
  input for static shapes
- Reworked the gemm_tiled_opt tests to cover more shape and transpose-order
  combinations, and added more test cases

### Tickets:
- 137358
---
 .../cl_kernels/gemm_tiled_opt.cl              |  16 +-
 .../tests/unit/test_cases/gemm_gpu_test.cpp   | 232 ++++++++----------
 2 files changed, 109 insertions(+), 139 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index 90dc0f07e0ffb0..c05ca859964ebb 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -412,16 +412,12 @@ KERNEL(gemm_tiled_opt)(
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read[subtile_k_id], simd_local_id)),
                                      b_tile[subtile_k_id * SIMD_WIDTH + simd_local_id], c_tile[dot_id]);
 #else // TILE_K > SIMD_WIDTH
-#if IS_DYNAMIC && B_VEC_SIZE > 1
-    #if TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
+#if B_VEC_SIZE > 1 && TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
                 MAKE_VECTOR_TYPE(INPUT1_TYPE, B_VEC_SIZE) b_tile_tmp;
                 unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
                     b_tile_tmp[b_elem] = b_tile[b_elem][simd_local_id];
                 }
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_local_id)), b_tile_tmp, c_tile[dot_id]);
-    #else
-                c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_local_id)), b_tile[simd_local_id], c_tile[dot_id]);
-    #endif
 #else
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_local_id)), b_tile[simd_local_id], c_tile[dot_id]);
 #endif
@@ -464,7 +460,15 @@ KERNEL(gemm_tiled_opt)(
     // Tile C calculation for TN, TT cases
     unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
         unroll_for (uint simd_local_id = 0; simd_local_id < SIMD_WIDTH; simd_local_id++) {
-            c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_tile[dot_id], simd_local_id)), b_tile[simd_local_id], c_tile[dot_id]);
+#if B_VEC_SIZE > 1 && TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
+            MAKE_VECTOR_TYPE(INPUT1_TYPE, B_VEC_SIZE) b_tile_tmp;
+            unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
+                b_tile_tmp[b_elem] = b_tile[b_elem][simd_local_id];
+            }
+            c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_tile[dot_id], simd_local_id)), b_tile_tmp, c_tile[dot_id]);
+#else
+            c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_tile[dot_id], simd_local_id)), b_tile[simd_local_id], c_tile[dot_id]);
+#endif
         }
     } // Tile C calculation for TN, TT cases end
 #endif // !TRANSPOSE_INPUT0
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
index 9220fad47269b4..b06a95f06ad295 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
@@ -1298,44 +1298,52 @@ class gemm_gpu_tests: public ::testing::Test {
         }
     }
 
-    void test_transpose_matmul_f32(size_t num_dims, bool is_input_dynamic, bool is_caching_test) {
+    void set_default_shapes(size_t num_dims, std::vector<size_t>& BMKN, ov::Shape& input0_shape_default, ov::Shape& input1_shape_default, ov::Shape& output_shape_default) {
+        size_t BATCH_SIZE = BMKN[0];
+        size_t M_SIZE = BMKN[1];
+        size_t K_SIZE = BMKN[2];
+        size_t N_SIZE = BMKN[3];
+        if (num_dims == 1) {
+            input0_shape_default = { K_SIZE };
+            input1_shape_default = { K_SIZE, N_SIZE };
+            output_shape_default = { 1, N_SIZE };
+        } else if (num_dims == 2) {
+            input0_shape_default = { M_SIZE, K_SIZE };
+            input1_shape_default = { K_SIZE, N_SIZE };
+            output_shape_default = { M_SIZE, N_SIZE };
+        } else if (num_dims == 3) {
+            input0_shape_default = { BATCH_SIZE, M_SIZE, K_SIZE };
+            input1_shape_default = { BATCH_SIZE, K_SIZE, N_SIZE };
+            output_shape_default = { BATCH_SIZE, M_SIZE, N_SIZE };
+        } else if (num_dims == 4) {
+            input0_shape_default = { BATCH_SIZE, 1, M_SIZE, K_SIZE };
+            input1_shape_default = { BATCH_SIZE, 1, K_SIZE, N_SIZE };
+            output_shape_default = { BATCH_SIZE, 1, M_SIZE, N_SIZE };
+        }
+    }
+
+    void test_transpose_matmul_f32(size_t num_dims, bool is_input_dynamic, bool is_caching_test, std::vector<size_t> BMKN, std::vector<int64_t> input0_order, std::vector<int64_t> input1_order) {
         tests::random_generator rg;
         rg.set_seed(GET_SUITE_NAME);
 
-        const unsigned long BATCH_SIZE = 19;
-        const unsigned long M_SIZE = 37;
-        const unsigned long K_SIZE = 23;
-        const unsigned long N_SIZE = 29;
-
         auto& engine = get_test_engine();
-        ov::Shape input0_shape;
-        ov::Shape input1_shape;
-        std::vector<int64_t> input0_order;
-        std::vector<int64_t> input1_order;
+        ov::Shape input0_shape_default;
+        ov::Shape input1_shape_default;
+        ov::Shape output_shape_default;
         ov::Shape beam_table_shape;
         cldnn::layout input0_layout;
         cldnn::layout input1_layout;
 
-        if (num_dims == 1) {
-            input0_shape = { K_SIZE };
-            input1_shape = { N_SIZE, K_SIZE };
-            input0_order = { 0 };
-            input1_order = { 1, 0 };
-        } else if (num_dims == 2) {
-            input0_shape = { K_SIZE, M_SIZE };
-            input1_shape = { N_SIZE, K_SIZE };
-            input0_order = { 1, 0 };
-            input1_order = { 1, 0 };
-        } else if (num_dims == 3) {
-            input0_shape = { BATCH_SIZE, K_SIZE, M_SIZE };
-            input1_shape = { N_SIZE, BATCH_SIZE, K_SIZE };
-            input0_order = { 0, 2, 1 };
-            input1_order = { 1, 2, 0 };
-        } else if (num_dims == 4) {
-            input0_shape = { BATCH_SIZE, K_SIZE, 1, M_SIZE };
-            input1_shape = { N_SIZE, BATCH_SIZE, 1, K_SIZE };
-            input0_order = { 0, 2, 3, 1 };
-            input1_order = { 1, 2, 3, 0 };
+        set_default_shapes(num_dims, BMKN, input0_shape_default, input1_shape_default, output_shape_default);
+        ov::Shape input0_shape(input0_shape_default.size());
+        ov::Shape input1_shape(input1_shape_default.size());
+
+        for (size_t dim = 0; dim < input0_shape_default.size(); ++dim) {
+            input0_shape[input0_order[dim]] = input0_shape_default[dim];
+        }
+
+        for (size_t dim = 0; dim < input1_shape_default.size(); ++dim) {
+            input1_shape[input1_order[dim]] = input1_shape_default[dim];
         }
 
         if (is_input_dynamic) {
@@ -1378,29 +1386,8 @@ class gemm_gpu_tests: public ::testing::Test {
         auto output_mem = outputs.at("gemm").get_memory();
         cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
 
-        ov::Shape ref_input0_shape;
-        ov::Shape ref_input1_shape;
-        ov::Shape ref_output_shape;
-        if (num_dims == 1) {
-            ref_input0_shape = { K_SIZE };
-            ref_input1_shape = { K_SIZE, N_SIZE };
-            ref_output_shape = { 1, N_SIZE };
-        } else if (num_dims == 2) {
-            ref_input0_shape = { M_SIZE, K_SIZE };
-            ref_input1_shape = { K_SIZE, N_SIZE };
-            ref_output_shape = { M_SIZE, N_SIZE };
-        } else if (num_dims == 3) {
-            ref_input0_shape = { BATCH_SIZE, M_SIZE, K_SIZE };
-            ref_input1_shape = { BATCH_SIZE, K_SIZE, N_SIZE };
-            ref_output_shape = { BATCH_SIZE, M_SIZE, N_SIZE };
-        } else if (num_dims == 4) {
-            ref_input0_shape = { BATCH_SIZE, 1, M_SIZE, K_SIZE };
-            ref_input1_shape = { BATCH_SIZE, 1, K_SIZE, N_SIZE };
-            ref_output_shape = { BATCH_SIZE, 1, M_SIZE, N_SIZE };
-        }
-
         std::vector<float> ref_out_data;
-        ref_out_data.resize(ov::shape_size(ref_output_shape));
+        ref_out_data.resize(ov::shape_size(output_shape_default));
 
         std::vector<float> ref_input_0_data(input_0_data.size());
         std::vector<float> ref_input_1_data(input_1_data.size());
@@ -1410,21 +1397,21 @@ class gemm_gpu_tests: public ::testing::Test {
                                  input0_shape,
                                  sizeof(float),
                                  input0_order,
-                                 ref_input0_shape);
+                                 input0_shape_default);
 
         ov::reference::transpose((const char *)(input_1_data.data()),
                                  (char *)(ref_input_1_data.data()),
                                  input1_shape,
                                  sizeof(float),
                                  input1_order,
-                                 ref_input1_shape);
+                                 input1_shape_default);
 
         ov::reference::matmul(ref_input_0_data.data(),
                               ref_input_1_data.data(),
                               ref_out_data.data(),
-                              ref_input0_shape,
-                              ref_input1_shape,
-                              ref_output_shape,
+                              input0_shape_default,
+                              input1_shape_default,
+                              output_shape_default,
                               false,
                               false);
 
@@ -1436,44 +1423,28 @@ class gemm_gpu_tests: public ::testing::Test {
         }
     }
 
-    void test_transpose_matmul_f16(size_t num_dims, bool is_input_dynamic, bool is_caching_test) {
+    void test_transpose_matmul_f16(size_t num_dims, bool is_input_dynamic, bool is_caching_test, std::vector<size_t> BMKN, std::vector<int64_t> input0_order, std::vector<int64_t> input1_order) {
         tests::random_generator rg;
         rg.set_seed(GET_SUITE_NAME);
 
-        const unsigned long BATCH_SIZE = 19;
-        const unsigned long M_SIZE = 37;
-        const unsigned long K_SIZE = 23;
-        const unsigned long N_SIZE = 29;
-
         auto& engine = get_test_engine();
-        ov::Shape input0_shape;
-        ov::Shape input1_shape;
-        std::vector<int64_t> input0_order;
-        std::vector<int64_t> input1_order;
+        ov::Shape input0_shape_default;
+        ov::Shape input1_shape_default;
+        ov::Shape output_shape_default;
         ov::Shape beam_table_shape;
         cldnn::layout input0_layout;
         cldnn::layout input1_layout;
 
-        if (num_dims == 1) {
-            input0_shape = { K_SIZE };
-            input1_shape = { N_SIZE, K_SIZE };
-            input0_order = { 0 };
-            input1_order = { 1, 0 };
-        } else if (num_dims == 2) {
-            input0_shape = { K_SIZE, M_SIZE };
-            input1_shape = { N_SIZE, K_SIZE };
-            input0_order = { 1, 0 };
-            input1_order = { 1, 0 };
-        } else if (num_dims == 3) {
-            input0_shape = { BATCH_SIZE, K_SIZE, M_SIZE };
-            input1_shape = { N_SIZE, BATCH_SIZE, K_SIZE };
-            input0_order = { 0, 2, 1 };
-            input1_order = { 1, 2, 0 };
-        } else if (num_dims == 4) {
-            input0_shape = { BATCH_SIZE, K_SIZE, 1, M_SIZE };
-            input1_shape = { N_SIZE, BATCH_SIZE, 1, K_SIZE };
-            input0_order = { 0, 2, 3, 1 };
-            input1_order = { 1, 2, 3, 0 };
+        set_default_shapes(num_dims, BMKN, input0_shape_default, input1_shape_default, output_shape_default);
+        ov::Shape input0_shape(input0_shape_default.size());
+        ov::Shape input1_shape(input1_shape_default.size());
+
+        for (size_t dim = 0; dim < input0_shape_default.size(); ++dim) {
+            input0_shape[input0_order[dim]] = input0_shape_default[dim];
+        }
+
+        for (size_t dim = 0; dim < input1_shape_default.size(); ++dim) {
+            input1_shape[input1_order[dim]] = input1_shape_default[dim];
         }
 
         if (is_input_dynamic) {
@@ -1516,29 +1487,8 @@ class gemm_gpu_tests: public ::testing::Test {
         auto output_mem = outputs.at("gemm").get_memory();
         cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
 
-        ov::Shape ref_input0_shape;
-        ov::Shape ref_input1_shape;
-        ov::Shape ref_output_shape;
-        if (num_dims == 1) {
-            ref_input0_shape = { K_SIZE };
-            ref_input1_shape = { K_SIZE, N_SIZE };
-            ref_output_shape = { 1, N_SIZE };
-        } else if (num_dims == 2) {
-            ref_input0_shape = { M_SIZE, K_SIZE };
-            ref_input1_shape = { K_SIZE, N_SIZE };
-            ref_output_shape = { M_SIZE, N_SIZE };
-        } else if (num_dims == 3) {
-            ref_input0_shape = { BATCH_SIZE, M_SIZE, K_SIZE };
-            ref_input1_shape = { BATCH_SIZE, K_SIZE, N_SIZE };
-            ref_output_shape = { BATCH_SIZE, M_SIZE, N_SIZE };
-        } else if (num_dims == 4) {
-            ref_input0_shape = { BATCH_SIZE, 1, M_SIZE, K_SIZE };
-            ref_input1_shape = { BATCH_SIZE, 1, K_SIZE, N_SIZE };
-            ref_output_shape = { BATCH_SIZE, 1, M_SIZE, N_SIZE };
-        }
-
         std::vector<ov::float16> ref_out_data;
-        ref_out_data.resize(ov::shape_size(ref_output_shape));
+        ref_out_data.resize(ov::shape_size(output_shape_default));
 
         std::vector<ov::float16> ref_input_0_data(input_0_data.size());
         std::vector<ov::float16> ref_input_1_data(input_1_data.size());
@@ -1548,21 +1498,21 @@ class gemm_gpu_tests: public ::testing::Test {
                                  input0_shape,
                                  sizeof(ov::float16),
                                  input0_order,
-                                 ref_input0_shape);
+                                 input0_shape_default);
 
         ov::reference::transpose((const char *)(input_1_data.data()),
                                  (char *)(ref_input_1_data.data()),
                                  input1_shape,
                                  sizeof(ov::float16),
                                  input1_order,
-                                 ref_input1_shape);
+                                 input1_shape_default);
 
         ov::reference::matmul(ref_input_0_data.data(),
                               ref_input_1_data.data(),
                               ref_out_data.data(),
-                              ref_input0_shape,
-                              ref_input1_shape,
-                              ref_output_shape,
+                              input0_shape_default,
+                              input1_shape_default,
+                              output_shape_default,
                               false,
                               false);
 
@@ -1601,67 +1551,83 @@ TEST_F(gemm_gpu_tests, dynamic_multi_inference_different_shape) {
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_1d_f16) {
-    this->test_transpose_matmul_f16(1, true, false);
+    this->test_transpose_matmul_f16(1, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_1d_f32) {
-    this->test_transpose_matmul_f32(1, true, false);
+    this->test_transpose_matmul_f32(1, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_1d_f16) {
-    this->test_transpose_matmul_f16(1, false, false);
+    this->test_transpose_matmul_f16(1, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_1d_f32) {
-    this->test_transpose_matmul_f32(1, false, false);
+    this->test_transpose_matmul_f32(1, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_2d_f16) {
-    this->test_transpose_matmul_f16(2, true, false);
+    this->test_transpose_matmul_f16(2, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{1, 0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_2d_f32) {
-    this->test_transpose_matmul_f32(2, true, false);
+    this->test_transpose_matmul_f32(2, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{1, 0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_2d_f16) {
-    this->test_transpose_matmul_f16(2, false, false);
+    this->test_transpose_matmul_f16(2, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{1, 0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_2d_f32) {
-    this->test_transpose_matmul_f32(2, false, false);
+    this->test_transpose_matmul_f32(2, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{1, 0}, /*input1_order*/{1, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_3d_f16) {
-    this->test_transpose_matmul_f16(3, true, false);
+    this->test_transpose_matmul_f16(3, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_3d_f32) {
-    this->test_transpose_matmul_f32(3, true, false);
+    this->test_transpose_matmul_f32(3, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16) {
-    this->test_transpose_matmul_f16(3, false, false);
+    this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32) {
-    this->test_transpose_matmul_f32(3, false, false);
+    this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
 }
 
-TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16) {
-    this->test_transpose_matmul_f16(4, true, false);
+TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16_unaligned) {
+    this->test_transpose_matmul_f16(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
+}
+
+TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16_aligned) {
+    this->test_transpose_matmul_f16(4, true, false, /*BMKN*/{1, 128, 32, 64}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
+}
+
+TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16_unaligned_input1_ylast) {
+    this->test_transpose_matmul_f16(4, true, false, /*BMKN*/{1, 128, 32, 64}, /*input0_order*/{0, 1, 2, 3}, /*input1_order*/{0, 1, 3, 2});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f32) {
-    this->test_transpose_matmul_f32(4, true, false);
+    this->test_transpose_matmul_f32(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f16) {
-    this->test_transpose_matmul_f16(4, false, false);
+    this->test_transpose_matmul_f16(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
+}
+
+TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f32_n_tile_16) {
+    this->test_transpose_matmul_f32(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
+}
+
+TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f32_n_tile_32) {
+    this->test_transpose_matmul_f32(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
 }
 
-TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f32) {
-    this->test_transpose_matmul_f32(4, false, false);
+TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f32_n_tile_32_input1_ylast) {
+    this->test_transpose_matmul_f32(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 1, 2, 3}, /*input1_order*/{0, 1, 3, 2});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_in0_indirect) {
@@ -3135,7 +3101,7 @@ TEST_F(gemm_gpu_tests, basic_bfyx_t2_inplace_crop_with_pad_cached) {
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_cached) {
-    this->test_transpose_matmul_f16(4, true, true);
+    this->test_transpose_matmul_f16(4, true, true, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
 }
 
 TEST_F(gemm_gpu_tests, transpose_matmul_transpose_dynamic_4d_cached) {
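
### Notes (illustrative sketches, not part of the patch):

The reworked test helpers build each physical input shape by scattering the default (row-major MatMul) shape through the transpose order, `input0_shape[input0_order[dim]] = input0_shape_default[dim]`, so that `ov::reference::transpose` applied with the same order recovers the default shapes for the reference computation. Below is a minimal standalone sketch of that round trip; `Shape` is a plain `std::vector<size_t>` stand-in for `ov::Shape`, and `make_transposed_input`/`apply_order` are hypothetical helpers mirroring the test logic, not functions from the patch.

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

using Shape = std::vector<size_t>;  // stand-in for ov::Shape

// Mirrors the patched tests: place each default dimension at the position
// the transpose order will read it back from.
Shape make_transposed_input(const Shape& shape_default, const std::vector<size_t>& order) {
    Shape shape(shape_default.size());
    for (size_t dim = 0; dim < shape_default.size(); ++dim)
        shape[order[dim]] = shape_default[dim];
    return shape;
}

// Applies a transpose order to a shape, as ov::reference::transpose does.
Shape apply_order(const Shape& shape, const std::vector<size_t>& order) {
    Shape out(shape.size());
    for (size_t dim = 0; dim < shape.size(); ++dim)
        out[dim] = shape[order[dim]];
    return out;
}

int main() {
    // The new 4D input1-Y-last case: BMKN = {1, 128, 32, 64}, so the default
    // input1 shape is {B, 1, K, N} = {1, 1, 32, 64} with order {0, 1, 3, 2}.
    const Shape input1_default = {1, 1, 32, 64};
    const std::vector<size_t> input1_order = {0, 1, 3, 2};

    const Shape input1 = make_transposed_input(input1_default, input1_order);
    assert(input1 == (Shape{1, 1, 64, 32}));                      // K and N swapped in memory
    assert(apply_order(input1, input1_order) == input1_default);  // round trip
    std::cout << "shape round-trip OK\n";
}
```

On the kernel side, the fix extends the `TRANSPOSE_Y_LAST` gather for `B_VEC_SIZE > 1` from the dynamic-shape path to the static-shape TN/TT path: `b_tile` holds `B_VEC_SIZE` sub-tiles, each indexed by SIMD lane, so each dot step must first collect one element per sub-tile into `b_tile_tmp` before the `mad`. The scalar C++ model below assumes `B_VEC_SIZE = TILE_N / SIMD_WIDTH` (2 for a tile_n_size of 32 with a 16-wide sub-group), which the patch implies but does not state; the `accumulate` function and array types are illustrative stand-ins for the sub-group broadcasts and `mad` in the kernel.

```cpp
#include <array>
#include <cstddef>

constexpr size_t SIMD_WIDTH = 16;  // assumed sub-group width
constexpr size_t B_VEC_SIZE = 2;   // assumed: tile_n_size 32 / SIMD_WIDTH 16

// b_tile[b_elem][lane]: B_VEC_SIZE sub-tiles of B, each SIMD_WIDTH wide
// (the Y-last layout the kernel loads for TRANSPOSE_Y_LAST).
using BTile = std::array<std::array<float, SIMD_WIDTH>, B_VEC_SIZE>;

// One c_tile accumulation step for one output row: for each k-lane, gather
// that lane's element from every sub-tile (the added b_tile_tmp), then
// multiply-accumulate, matching mad(broadcast(a), b_tile_tmp, c_tile).
void accumulate(std::array<float, B_VEC_SIZE>& c_vec,
                const std::array<float, SIMD_WIDTH>& a_row,
                const BTile& b_tile) {
    for (size_t lane = 0; lane < SIMD_WIDTH; ++lane) {
        std::array<float, B_VEC_SIZE> b_tile_tmp;
        for (size_t b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem)
            b_tile_tmp[b_elem] = b_tile[b_elem][lane];  // the gather the fix adds
        for (size_t b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem)
            c_vec[b_elem] += a_row[lane] * b_tile_tmp[b_elem];
    }
}
```

In this Y-last layout, the old static-shape path's plain `b_tile[simd_local_id]` indexes across the wrong axis once `B_VEC_SIZE > 1`, which is consistent with the crash and accuracy issues described in the Details section above.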