From 64f1abf223ed5a82d33ede95ab4b84a267be6b9f Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 4 Mar 2022 22:47:50 +0000 Subject: [PATCH 1/5] fix tests --- example/13_gemm_reduce/gemm_xdl_reduce.cpp | 235 ++++++++++++++++++ test/CMakeLists.txt | 20 +- .../{main.cpp => conv2d_bwd_data.cpp} | 0 test/{ => conv2d_fwd}/conv2d_fwd.cpp | 0 test/{conv_util/main.cpp => conv_util.cpp} | 0 .../{main.cpp => convnd_fwd_xdl.cpp} | 0 .../{test_gemm_bf16.cpp => gemm_bf16.cpp} | 0 .../{test_gemm_fp32.cpp => gemm_fp32.cpp} | 0 .../{test_gemm_int8.cpp => gemm_int8.cpp} | 0 .../magic_number_division.cpp | 0 .../{main.cpp => reference_conv_fwd.cpp} | 0 test/{ => split_k}/split_k.cpp | 0 12 files changed, 236 insertions(+), 19 deletions(-) create mode 100644 example/13_gemm_reduce/gemm_xdl_reduce.cpp rename test/conv2d_bwd_data/{main.cpp => conv2d_bwd_data.cpp} (100%) rename test/{ => conv2d_fwd}/conv2d_fwd.cpp (100%) rename test/{conv_util/main.cpp => conv_util.cpp} (100%) rename test/convnd_fwd_xdl/{main.cpp => convnd_fwd_xdl.cpp} (100%) rename test/gemm_xdl/{test_gemm_bf16.cpp => gemm_bf16.cpp} (100%) rename test/gemm_xdl/{test_gemm_fp32.cpp => gemm_fp32.cpp} (100%) rename test/gemm_xdl/{test_gemm_int8.cpp => gemm_int8.cpp} (100%) rename test/{ => magic_number_division}/magic_number_division.cpp (100%) rename test/reference_conv_fwd/{main.cpp => reference_conv_fwd.cpp} (100%) rename test/{ => split_k}/split_k.cpp (100%) diff --git a/example/13_gemm_reduce/gemm_xdl_reduce.cpp b/example/13_gemm_reduce/gemm_xdl_reduce.cpp new file mode 100644 index 00000000000..4dc8d0b7883 --- /dev/null +++ b/example/13_gemm_reduce/gemm_xdl_reduce.cpp @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_gemm.hpp" +#include "device_tensor.hpp" +#include "element_wise_operation.hpp" +#include "device_gemm_xdl_c_shuffle_bias_activation.hpp" +#include "reference_gemm_bias_activation.hpp" + +template +using S = ck::Sequence; + +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::AddRelu; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation< + ADataType, // ADataType + BDataType, // BDataType + CDataType, // CDataType + AccDataType, // AccDataType + ALayout, // ALayout + BLayout, // BLayout + CLayout, // CLayout + AElementOp, // AElementwiseOperation + BElementOp, // BElementwiseOperation + CElementOp, // CElementwiseOperation + 256, // BlockSize + 256, // MPerBlock + 128, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXDL + 32, // NPerXDL + 4, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActivation; + +int main(int argc, char* argv[]) +{ + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: run kernel # of times (>1)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1, stride})); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + // c0_n[n] + Tensor c0_n(HostTensorDescriptor( + std::vector({static_cast(N)}), std::vector({1}))); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + std::cout << "c0_n: " << c0_n.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + c0_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + c0_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); + DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); + c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data()); + c0_n_device_buf.ToDevice(c0_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + static_cast(c0_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, nrepeat); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + + sizeof(CDataType) * M * N + sizeof(CDataType) * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + if(do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, c0_n, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + check_error(c_m_n_host_result, c_m_n_device_result); + } +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 54a44114d0f..0eb22d15c95 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -27,28 +27,10 @@ function(add_test_executeable TEST_NAME) endfunction(add_test_executeable TEST_NAME) -file(GLOB TESTS *.cpp) +file(GLOB TESTS */*.cpp) foreach(TEST ${TESTS}) get_filename_component(BASE_NAME ${TEST} NAME_WE) message("adding test ${BASE_NAME}") add_test_executeable(test_${BASE_NAME} ${TEST}) endforeach(TEST ${TESTS}) - -# test_gemm_xdl_fp32 -set(GEMM_XDL_FP32_SOURCE gemm_xdl/test_gemm_fp32.cpp) -add_executable(test_gemm_xdl_fp32 ${GEMM_XDL_FP32_SOURCE}) -target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance) - -# test_gemm_xdl_bf16 -set(GEMM_XDL_BF16_SOURCE gemm_xdl/test_gemm_bf16.cpp) -add_executable(test_gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE}) -target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance) - -# test_gemm_xdl_int8 -set(GEMM_XDL_INT8_SOURCE gemm_xdl/test_gemm_int8.cpp) -add_executable(test_gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE}) -target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance) diff --git a/test/conv2d_bwd_data/main.cpp b/test/conv2d_bwd_data/conv2d_bwd_data.cpp similarity index 100% rename from test/conv2d_bwd_data/main.cpp rename to test/conv2d_bwd_data/conv2d_bwd_data.cpp diff --git a/test/conv2d_fwd.cpp b/test/conv2d_fwd/conv2d_fwd.cpp similarity index 100% rename from test/conv2d_fwd.cpp rename to test/conv2d_fwd/conv2d_fwd.cpp diff --git a/test/conv_util/main.cpp b/test/conv_util.cpp similarity index 100% rename from test/conv_util/main.cpp rename to test/conv_util.cpp diff --git a/test/convnd_fwd_xdl/main.cpp b/test/convnd_fwd_xdl/convnd_fwd_xdl.cpp similarity index 100% rename from test/convnd_fwd_xdl/main.cpp rename to test/convnd_fwd_xdl/convnd_fwd_xdl.cpp diff --git a/test/gemm_xdl/test_gemm_bf16.cpp b/test/gemm_xdl/gemm_bf16.cpp similarity index 100% rename from test/gemm_xdl/test_gemm_bf16.cpp rename to test/gemm_xdl/gemm_bf16.cpp diff --git a/test/gemm_xdl/test_gemm_fp32.cpp b/test/gemm_xdl/gemm_fp32.cpp similarity index 100% rename from test/gemm_xdl/test_gemm_fp32.cpp rename to test/gemm_xdl/gemm_fp32.cpp diff --git a/test/gemm_xdl/test_gemm_int8.cpp b/test/gemm_xdl/gemm_int8.cpp similarity index 100% rename from test/gemm_xdl/test_gemm_int8.cpp rename to test/gemm_xdl/gemm_int8.cpp diff --git a/test/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp similarity index 100% rename from test/magic_number_division.cpp rename to test/magic_number_division/magic_number_division.cpp diff --git a/test/reference_conv_fwd/main.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp similarity index 100% rename from test/reference_conv_fwd/main.cpp rename to test/reference_conv_fwd/reference_conv_fwd.cpp diff --git a/test/split_k.cpp b/test/split_k/split_k.cpp similarity index 100% rename from test/split_k.cpp rename to test/split_k/split_k.cpp From ee9c2bd91e039463b2e3e6c4dcf78e4968b3cbdf Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 4 Mar 2022 22:50:37 +0000 Subject: [PATCH 2/5] remove useless file --- example/13_gemm_reduce/gemm_xdl_reduce.cpp | 235 --------------------- 1 file changed, 235 deletions(-) delete mode 100644 example/13_gemm_reduce/gemm_xdl_reduce.cpp diff --git a/example/13_gemm_reduce/gemm_xdl_reduce.cpp b/example/13_gemm_reduce/gemm_xdl_reduce.cpp deleted file mode 100644 index 4dc8d0b7883..00000000000 --- a/example/13_gemm_reduce/gemm_xdl_reduce.cpp +++ /dev/null @@ -1,235 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "config.hpp" -#include "print.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "host_gemm.hpp" -#include "device_tensor.hpp" -#include "element_wise_operation.hpp" -#include "device_gemm_xdl_c_shuffle_bias_activation.hpp" -#include "reference_gemm_bias_activation.hpp" - -template -using S = ck::Sequence; - -using ADataType = ck::half_t; -using BDataType = ck::half_t; -using CDataType = ck::half_t; -using AccDataType = float; - -using ALayout = ck::tensor_layout::gemm::RowMajor; -using BLayout = ck::tensor_layout::gemm::ColumnMajor; -using CLayout = ck::tensor_layout::gemm::RowMajor; - -using AElementOp = ck::tensor_operation::element_wise::PassThrough; -using BElementOp = ck::tensor_operation::element_wise::PassThrough; -using CElementOp = ck::tensor_operation::element_wise::AddRelu; - -// clang-format off -using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation< - ADataType, // ADataType - BDataType, // BDataType - CDataType, // CDataType - AccDataType, // AccDataType - ALayout, // ALayout - BLayout, // BLayout - CLayout, // CLayout - AElementOp, // AElementwiseOperation - BElementOp, // BElementwiseOperation - CElementOp, // CElementwiseOperation - 256, // BlockSize - 256, // MPerBlock - 128, // NPerBlock - 4, // K0PerBlock - 8, // K1 - 32, // MPerXDL - 32, // NPerXDL - 4, // MXdlPerWave - 2, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_K1 - true, // ABlockLdsAddExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_K1 - true, // BBlockLdsAddExtraN - 1, // CShuffleMXdlPerWavePerShuffle - 1, // CShuffleNXdlPerWavePerShuffle - S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl - 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl -// clang-format on - -using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActivation; - -int main(int argc, char* argv[]) -{ - bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; - - // GEMM shape - ck::index_t M = 3840; - ck::index_t N = 4096; - ck::index_t K = 4096; - - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; - - if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - } - else if(argc == 10) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); - - M = std::stoi(argv[4]); - N = std::stoi(argv[5]); - K = std::stoi(argv[6]); - - StrideA = std::stoi(argv[7]); - StrideB = std::stoi(argv[8]); - StrideC = std::stoi(argv[9]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); - exit(0); - } - - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); - } - }; - - Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); - Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); - Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - - // c0_n[n] - Tensor c0_n(HostTensorDescriptor( - std::vector({static_cast(N)}), std::vector({1}))); - - std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; - std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; - std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; - std::cout << "c0_n: " << c0_n.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - c0_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - c0_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - } - - DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); - DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); - DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace()); - - a_m_k_device_buf.ToDevice(a_m_k.mData.data()); - b_k_n_device_buf.ToDevice(b_k_n.mData.data()); - c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data()); - c0_n_device_buf.ToDevice(c0_n.mData.data()); - - auto a_element_op = AElementOp{}; - auto b_element_op = BElementOp{}; - auto c_element_op = CElementOp{}; - - // do GEMM - auto gemm = DeviceGemmInstance{}; - - auto invoker = gemm.MakeInvoker(); - auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), - static_cast(b_k_n_device_buf.GetDeviceBuffer()), - static_cast(c_m_n_device_buf.GetDeviceBuffer()), - static_cast(c0_n_device_buf.GetDeviceBuffer()), - M, - N, - K, - StrideA, - StrideB, - StrideC, - a_element_op, - b_element_op, - c_element_op); - - if(!gemm.IsSupportedArgument(argument)) - { - throw std::runtime_error( - "wrong! device_gemm with the specified compilation parameters does " - "not support this GEMM problem"); - } - - float ave_time = invoker.Run(argument, nrepeat); - - std::size_t flop = std::size_t(2) * M * N * K; - - std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + - sizeof(CDataType) * M * N + sizeof(CDataType) * N; - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; - - c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); - - if(do_verification) - { - auto ref_gemm = ReferenceGemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument( - a_m_k, b_k_n, c_m_n_host_result, c0_n, a_element_op, b_element_op, c_element_op); - - ref_invoker.Run(ref_argument); - - check_error(c_m_n_host_result, c_m_n_device_result); - } -} From 730e2f1c565602913f0c0bd63c0fdf8b96536253 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 5 Mar 2022 01:08:11 +0000 Subject: [PATCH 3/5] fix test build --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0eb22d15c95..4de43065cc0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,12 +21,12 @@ function(add_test_executeable TEST_NAME) target_link_libraries(${TEST_NAME} PRIVATE host_tensor) target_link_libraries(${TEST_NAME} PRIVATE device_gemm_instance) target_link_libraries(${TEST_NAME} PRIVATE device_conv2d_fwd_instance) + target_link_libraries(${TEST_NAME} PRIVATE device_conv2d_bwd_data_instance) add_test(NAME ${TEST_NAME} COMMAND $ ) add_dependencies(tests ${TEST_NAME}) add_dependencies(check ${TEST_NAME}) endfunction(add_test_executeable TEST_NAME) - file(GLOB TESTS */*.cpp) foreach(TEST ${TESTS}) From 5b70abeff6bd6868938ef760ebb8fc63ecadc63b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 5 Mar 2022 03:07:48 +0000 Subject: [PATCH 4/5] reduce parallelism when compiling --- Jenkinsfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8d1fbc2578a..c2f9d96afe1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -60,7 +60,8 @@ def cmake_build(Map conf=[:]){ cd build """ def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") - def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 4 )) ${config_targets}") + // reduce parallelism when compiling, clang uses too much memory + def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 5 )) ${config_targets}") def execute_cmd = conf.get("execute_cmd", "") def cmd = conf.get("cmd", """ From 7fc8f02760dea0e92409b6eb190f8af5bcb3d43e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 5 Mar 2022 05:15:28 +0000 Subject: [PATCH 5/5] fix test --- test/conv2d_bwd_data/conv2d_bwd_data.cpp | 22 ++++++++++++++-------- test/conv2d_fwd/conv2d_fwd.cpp | 7 +++---- test/{ => conv_util}/conv_util.cpp | 0 3 files changed, 17 insertions(+), 12 deletions(-) rename test/{ => conv_util}/conv_util.cpp (100%) diff --git a/test/conv2d_bwd_data/conv2d_bwd_data.cpp b/test/conv2d_bwd_data/conv2d_bwd_data.cpp index 72ed6ee0743..0d265963963 100644 --- a/test/conv2d_bwd_data/conv2d_bwd_data.cpp +++ b/test/conv2d_bwd_data/conv2d_bwd_data.cpp @@ -11,8 +11,9 @@ using F16 = ck::half_t; using F32 = float; -using BF16 = ushort; +using BF16 = ck::bhalf_t; using INT8 = int8_t; + namespace ck { namespace tensor_operation { namespace device { @@ -22,6 +23,7 @@ using DeviceConvBwdDataNoOpPtr = DeviceConvBwdDataPtr; + void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( std::vector&); void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( @@ -30,6 +32,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( std::vector&); void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( std::vector&); + } // namespace device_conv2d_bwd_data_instance } // namespace device } // namespace tensor_operation @@ -78,7 +81,12 @@ int main(int argc, char* argv[]) ck::index_t in_right_pad_h = 1; ck::index_t in_right_pad_w = 1; - if(argc == 3) + if(argc == 1) + { + data_type = 1; + init_method = 1; + } + else if(argc == 3) { data_type = std::stoi(argv[1]); init_method = std::stoi(argv[2]); @@ -106,11 +114,9 @@ int main(int argc, char* argv[]) } else { - printf("arg1: data type (0=fp32 )\n"); - printf("arg2: verification (0=no, 1=yes)\n"); - printf("arg3: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg4: run kernel # of times (>1)\n"); - printf("arg5 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); } @@ -296,7 +302,7 @@ int main(int argc, char* argv[]) if(data_type == 0) { - Run(float(), float(), F32()); + Run(F32(), F32(), F32()); } else if(data_type == 1) { diff --git a/test/conv2d_fwd/conv2d_fwd.cpp b/test/conv2d_fwd/conv2d_fwd.cpp index 26f348b21a8..164d4a1cc10 100644 --- a/test/conv2d_fwd/conv2d_fwd.cpp +++ b/test/conv2d_fwd/conv2d_fwd.cpp @@ -77,8 +77,8 @@ int main(int argc, char* argv[]) ck::index_t in_right_pad_w = 1; if(argc == 1) { + data_type = 1; init_method = 1; - data_type = 0; } else if(argc == 3) { @@ -108,10 +108,9 @@ int main(int argc, char* argv[]) } else { - printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: run kernel # of times (>1)\n"); - printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " "RightPx\n"); exit(1); } diff --git a/test/conv_util.cpp b/test/conv_util/conv_util.cpp similarity index 100% rename from test/conv_util.cpp rename to test/conv_util/conv_util.cpp