From 64f1abf223ed5a82d33ede95ab4b84a267be6b9f Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Fri, 4 Mar 2022 22:47:50 +0000
Subject: [PATCH 1/5] fix tests: rename test sources and glob them with */*.cpp

---
 example/13_gemm_reduce/gemm_xdl_reduce.cpp    | 235 ++++++++++++++++++
 test/CMakeLists.txt                           |  20 +-
 .../{main.cpp => conv2d_bwd_data.cpp}         |   0
 test/{ => conv2d_fwd}/conv2d_fwd.cpp          |   0
 test/{conv_util/main.cpp => conv_util.cpp}    |   0
 .../{main.cpp => convnd_fwd_xdl.cpp}          |   0
 .../{test_gemm_bf16.cpp => gemm_bf16.cpp}     |   0
 .../{test_gemm_fp32.cpp => gemm_fp32.cpp}     |   0
 .../{test_gemm_int8.cpp => gemm_int8.cpp}     |   0
 .../magic_number_division.cpp                 |   0
 .../{main.cpp => reference_conv_fwd.cpp}      |   0
 test/{ => split_k}/split_k.cpp                |   0
 12 files changed, 236 insertions(+), 19 deletions(-)
 create mode 100644 example/13_gemm_reduce/gemm_xdl_reduce.cpp
 rename test/conv2d_bwd_data/{main.cpp => conv2d_bwd_data.cpp} (100%)
 rename test/{ => conv2d_fwd}/conv2d_fwd.cpp (100%)
 rename test/{conv_util/main.cpp => conv_util.cpp} (100%)
 rename test/convnd_fwd_xdl/{main.cpp => convnd_fwd_xdl.cpp} (100%)
 rename test/gemm_xdl/{test_gemm_bf16.cpp => gemm_bf16.cpp} (100%)
 rename test/gemm_xdl/{test_gemm_fp32.cpp => gemm_fp32.cpp} (100%)
 rename test/gemm_xdl/{test_gemm_int8.cpp => gemm_int8.cpp} (100%)
 rename test/{ => magic_number_division}/magic_number_division.cpp (100%)
 rename test/reference_conv_fwd/{main.cpp => reference_conv_fwd.cpp} (100%)
 rename test/{ => split_k}/split_k.cpp (100%)
diff --git a/example/13_gemm_reduce/gemm_xdl_reduce.cpp b/example/13_gemm_reduce/gemm_xdl_reduce.cpp
new file mode 100644
index 00000000000..4dc8d0b7883
--- /dev/null
+++ b/example/13_gemm_reduce/gemm_xdl_reduce.cpp
@@ -0,0 +1,235 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "element_wise_operation.hpp"
+#include "device_gemm_xdl_c_shuffle_bias_activation.hpp"
+#include "reference_gemm_bias_activation.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using ADataType   = ck::half_t;
+using BDataType   = ck::half_t;
+using CDataType   = ck::half_t;
+using AccDataType = float;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::AddRelu;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation<
+    ADataType,              // ADataType
+    BDataType,              // BDataType
+    CDataType,              // CDataType
+    AccDataType,            // AccDataType
+    ALayout,                // ALayout
+    BLayout,                // BLayout
+    CLayout,                // CLayout
+    AElementOp,             // AElementwiseOperation
+    BElementOp,             // BElementwiseOperation
+    CElementOp,             // CElementwiseOperation
+    256,                    // BlockSize
+    256,                    // MPerBlock
+    128,                    // NPerBlock
+    4,                      // K0PerBlock
+    8,                      // K1
+    32,                     // MPerXDL
+    32,                     // NPerXDL
+    4,                      // MXdlPerWave
+    2,                      // NXdlPerWave
+    S<4, 64, 1>,            // ABlockTransferThreadClusterLengths_K0_M_K1
+    S<1, 0, 2>,             // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,             // ABlockTransferSrcAccessOrder
+    2,                      // ABlockTransferSrcVectorDim
+    8,                      // ABlockTransferSrcScalarPerVector
+    8,                      // ABlockTransferDstScalarPerVector_K1
+    true,                   // ABlockLdsAddExtraM
+    S<4, 64, 1>,            // BBlockTransferThreadClusterLengths_K0_N_K1
+    S<1, 0, 2>,             // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,             // BBlockTransferSrcAccessOrder
+    2,                      // BBlockTransferSrcVectorDim
+    8,                      // BBlockTransferSrcScalarPerVector
+    8,                      // BBlockTransferDstScalarPerVector_K1
+    true,                   // BBlockLdsAddExtraN
+    1,                      // CShuffleMXdlPerWavePerShuffle
+    1,                      // CShuffleNXdlPerWavePerShuffle
+    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+    8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
+// clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActivation<ADataType,
+                                                                                      BDataType,
+                                                                                      CDataType,
+                                                                                      AElementOp,
+                                                                                      BElementOp,
+                                                                                      CElementOp>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = 0;
+    int init_method      = 0;
+    int nrepeat          = 5;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1, stride}));
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    // c0_n[n]
+    Tensor<CDataType> c0_n(HostTensorDescriptor(
+        std::vector<std::size_t>({static_cast<std::size_t>(N)}), std::vector<std::size_t>({1})));
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "c0_n: " << c0_n.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        c0_n.GenerateTensorValue(GeneratorTensor_2<CDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        c0_n.GenerateTensorValue(GeneratorTensor_3<CDataType>{0.0, 1.0});
+    }
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace());
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
+    c0_n_device_buf.ToDevice(c0_n.mData.data());
+
+    auto a_element_op = AElementOp{};
+    auto b_element_op = BElementOp{};
+    auto c_element_op = CElementOp{};
+
+    // do GEMM
+    auto gemm = DeviceGemmInstance{};
+
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c0_n_device_buf.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      StrideC,
+                                      a_element_op,
+                                      b_element_op,
+                                      c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, nrepeat);
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+
+    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
+                            sizeof(CDataType) * M * N + sizeof(CDataType) * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+
+        auto ref_argument = ref_gemm.MakeArgument(
+            a_m_k, b_k_n, c_m_n_host_result, c0_n, a_element_op, b_element_op, c_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        check_error(c_m_n_host_result, c_m_n_device_result);
+    }
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 54a44114d0f..0eb22d15c95 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -27,28 +27,10 @@ function(add_test_executeable TEST_NAME)
 endfunction(add_test_executeable TEST_NAME)
 
 
-file(GLOB TESTS *.cpp)
+file(GLOB TESTS */*.cpp)
 
 foreach(TEST ${TESTS})
     get_filename_component(BASE_NAME ${TEST} NAME_WE)
     message("adding test ${BASE_NAME}")
     add_test_executeable(test_${BASE_NAME} ${TEST})
 endforeach(TEST ${TESTS})
-
-# test_gemm_xdl_fp32
-set(GEMM_XDL_FP32_SOURCE gemm_xdl/test_gemm_fp32.cpp)
-add_executable(test_gemm_xdl_fp32 ${GEMM_XDL_FP32_SOURCE})
-target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
-target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance)
-
-# test_gemm_xdl_bf16
-set(GEMM_XDL_BF16_SOURCE gemm_xdl/test_gemm_bf16.cpp)
-add_executable(test_gemm_xdl_bf16 ${GEMM_XDL_BF16_SOURCE})
-target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor)
-target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance)
-
-# test_gemm_xdl_int8
-set(GEMM_XDL_INT8_SOURCE gemm_xdl/test_gemm_int8.cpp)
-add_executable(test_gemm_xdl_int8 ${GEMM_XDL_INT8_SOURCE})
-target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor)
-target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance)
diff --git a/test/conv2d_bwd_data/main.cpp b/test/conv2d_bwd_data/conv2d_bwd_data.cpp
similarity index 100%
rename from test/conv2d_bwd_data/main.cpp
rename to test/conv2d_bwd_data/conv2d_bwd_data.cpp
diff --git a/test/conv2d_fwd.cpp b/test/conv2d_fwd/conv2d_fwd.cpp
similarity index 100%
rename from test/conv2d_fwd.cpp
rename to test/conv2d_fwd/conv2d_fwd.cpp
diff --git a/test/conv_util/main.cpp b/test/conv_util.cpp
similarity index 100%
rename from test/conv_util/main.cpp
rename to test/conv_util.cpp
diff --git a/test/convnd_fwd_xdl/main.cpp b/test/convnd_fwd_xdl/convnd_fwd_xdl.cpp
similarity index 100%
rename from test/convnd_fwd_xdl/main.cpp
rename to test/convnd_fwd_xdl/convnd_fwd_xdl.cpp
diff --git a/test/gemm_xdl/test_gemm_bf16.cpp b/test/gemm_xdl/gemm_bf16.cpp
similarity index 100%
rename from test/gemm_xdl/test_gemm_bf16.cpp
rename to test/gemm_xdl/gemm_bf16.cpp
diff --git a/test/gemm_xdl/test_gemm_fp32.cpp b/test/gemm_xdl/gemm_fp32.cpp
similarity index 100%
rename from test/gemm_xdl/test_gemm_fp32.cpp
rename to test/gemm_xdl/gemm_fp32.cpp
diff --git a/test/gemm_xdl/test_gemm_int8.cpp b/test/gemm_xdl/gemm_int8.cpp
similarity index 100%
rename from test/gemm_xdl/test_gemm_int8.cpp
rename to test/gemm_xdl/gemm_int8.cpp
diff --git a/test/magic_number_division.cpp b/test/magic_number_division/magic_number_division.cpp
similarity index 100%
rename from test/magic_number_division.cpp
rename to test/magic_number_division/magic_number_division.cpp
diff --git a/test/reference_conv_fwd/main.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp
similarity index 100%
rename from test/reference_conv_fwd/main.cpp
rename to test/reference_conv_fwd/reference_conv_fwd.cpp
diff --git a/test/split_k.cpp b/test/split_k/split_k.cpp
similarity index 100%
rename from test/split_k.cpp
rename to test/split_k/split_k.cpp

From ee9c2bd91e039463b2e3e6c4dcf78e4968b3cbdf Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Fri, 4 Mar 2022 22:50:37 +0000
Subject: [PATCH 2/5] remove useless example file gemm_xdl_reduce.cpp

---
 example/13_gemm_reduce/gemm_xdl_reduce.cpp | 235 ---------------------
 1 file changed, 235 deletions(-)
 delete mode 100644 example/13_gemm_reduce/gemm_xdl_reduce.cpp

diff --git a/example/13_gemm_reduce/gemm_xdl_reduce.cpp b/example/13_gemm_reduce/gemm_xdl_reduce.cpp
deleted file mode 100644
index 4dc8d0b7883..00000000000
--- a/example/13_gemm_reduce/gemm_xdl_reduce.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
-#include <half.hpp>
-#include "config.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_gemm.hpp"
-#include "device_tensor.hpp"
-#include "element_wise_operation.hpp"
-#include "device_gemm_xdl_c_shuffle_bias_activation.hpp"
-#include "reference_gemm_bias_activation.hpp"
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using ADataType   = ck::half_t;
-using BDataType   = ck::half_t;
-using CDataType   = ck::half_t;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::AddRelu;
-
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_Activation<
-    ADataType,              // ADataType
-    BDataType,              // BDataType
-    CDataType,              // CDataType
-    AccDataType,            // AccDataType
-    ALayout,                // ALayout
-    BLayout,                // BLayout
-    CLayout,                // CLayout
-    AElementOp,             // AElementwiseOperation
-    BElementOp,             // BElementwiseOperation
-    CElementOp,             // CElementwiseOperation
-    256,                    // BlockSize
-    256,                    // MPerBlock
-    128,                    // NPerBlock
-    4,                      // K0PerBlock
-    8,                      // K1
-    32,                     // MPerXDL
-    32,                     // NPerXDL
-    4,                      // MXdlPerWave
-    2,                      // NXdlPerWave
-    S<4, 64, 1>,            // ABlockTransferThreadClusterLengths_K0_M_K1
-    S<1, 0, 2>,             // ABlockTransferThreadClusterArrangeOrder
-    S<1, 0, 2>,             // ABlockTransferSrcAccessOrder
-    2,                      // ABlockTransferSrcVectorDim
-    8,                      // ABlockTransferSrcScalarPerVector
-    8,                      // ABlockTransferDstScalarPerVector_K1
-    true,                   // ABlockLdsAddExtraM
-    S<4, 64, 1>,            // BBlockTransferThreadClusterLengths_K0_N_K1
-    S<1, 0, 2>,             // BBlockTransferThreadClusterArrangeOrder
-    S<1, 0, 2>,             // BBlockTransferSrcAccessOrder
-    2,                      // BBlockTransferSrcVectorDim
-    8,                      // BBlockTransferSrcScalarPerVector
-    8,                      // BBlockTransferDstScalarPerVector_K1
-    true,                   // BBlockLdsAddExtraN
-    1,                      // CShuffleMXdlPerWavePerShuffle
-    1,                      // CShuffleNXdlPerWavePerShuffle
-    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
-    8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
-
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBiasActivation<ADataType,
-                                                                                      BDataType,
-                                                                                      CDataType,
-                                                                                      AElementOp,
-                                                                                      BElementOp,
-                                                                                      CElementOp>;
-
-int main(int argc, char* argv[])
-{
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
-
-    // GEMM shape
-    ck::index_t M = 3840;
-    ck::index_t N = 4096;
-    ck::index_t K = 4096;
-
-    ck::index_t StrideA = 4096;
-    ck::index_t StrideB = 4096;
-    ck::index_t StrideC = 4096;
-
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-    }
-    else if(argc == 10)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-
-        M = std::stoi(argv[4]);
-        N = std::stoi(argv[5]);
-        K = std::stoi(argv[6]);
-
-        StrideA = std::stoi(argv[7]);
-        StrideB = std::stoi(argv[8]);
-        StrideC = std::stoi(argv[9]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
-        exit(0);
-    }
-
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
-            }
-            else
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
-            }
-        };
-
-    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
-    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
-    // c0_n[n]
-    Tensor<CDataType> c0_n(HostTensorDescriptor(
-        std::vector<std::size_t>({static_cast<std::size_t>(N)}), std::vector<std::size_t>({1})));
-
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-    std::cout << "c0_n: " << c0_n.mDesc << std::endl;
-
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        c0_n.GenerateTensorValue(GeneratorTensor_2<CDataType>{-5, 5});
-        break;
-    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        c0_n.GenerateTensorValue(GeneratorTensor_3<CDataType>{0.0, 1.0});
-    }
-
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem c0_n_device_buf(sizeof(CDataType) * c0_n.mDesc.GetElementSpace());
-
-    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
-    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
-    c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
-    c0_n_device_buf.ToDevice(c0_n.mData.data());
-
-    auto a_element_op = AElementOp{};
-    auto b_element_op = BElementOp{};
-    auto c_element_op = CElementOp{};
-
-    // do GEMM
-    auto gemm = DeviceGemmInstance{};
-
-    auto invoker  = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c0_n_device_buf.GetDeviceBuffer()),
-                                      M,
-                                      N,
-                                      K,
-                                      StrideA,
-                                      StrideB,
-                                      StrideC,
-                                      a_element_op,
-                                      b_element_op,
-                                      c_element_op);
-
-    if(!gemm.IsSupportedArgument(argument))
-    {
-        throw std::runtime_error(
-            "wrong! device_gemm with the specified compilation parameters does "
-            "not support this GEMM problem");
-    }
-
-    float ave_time = invoker.Run(argument, nrepeat);
-
-    std::size_t flop = std::size_t(2) * M * N * K;
-
-    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * M +
-                            sizeof(CDataType) * M * N + sizeof(CDataType) * N;
-
-    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
-
-    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
-    if(do_verification)
-    {
-        auto ref_gemm    = ReferenceGemmInstance{};
-        auto ref_invoker = ref_gemm.MakeInvoker();
-
-        auto ref_argument = ref_gemm.MakeArgument(
-            a_m_k, b_k_n, c_m_n_host_result, c0_n, a_element_op, b_element_op, c_element_op);
-
-        ref_invoker.Run(ref_argument);
-
-        check_error(c_m_n_host_result, c_m_n_device_result);
-    }
-}

From 730e2f1c565602913f0c0bd63c0fdf8b96536253 Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Sat, 5 Mar 2022 01:08:11 +0000
Subject: [PATCH 3/5] fix test build: link tests against device_conv2d_bwd_data_instance

---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0eb22d15c95..4de43065cc0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,12 +21,12 @@ function(add_test_executeable TEST_NAME)
     target_link_libraries(${TEST_NAME} PRIVATE host_tensor)
     target_link_libraries(${TEST_NAME} PRIVATE device_gemm_instance)
     target_link_libraries(${TEST_NAME} PRIVATE device_conv2d_fwd_instance)
+    target_link_libraries(${TEST_NAME} PRIVATE device_conv2d_bwd_data_instance)
     add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
     add_dependencies(tests ${TEST_NAME})
     add_dependencies(check ${TEST_NAME})
 endfunction(add_test_executeable TEST_NAME)
 
-
 file(GLOB TESTS */*.cpp)
 
 foreach(TEST ${TESTS})

From 5b70abeff6bd6868938ef760ebb8fc63ecadc63b Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Sat, 5 Mar 2022 03:07:48 +0000
Subject: [PATCH 4/5] reduce parallelism when compiling

---
 Jenkinsfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 8d1fbc2578a..c2f9d96afe1 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -60,7 +60,8 @@ def cmake_build(Map conf=[:]){
             cd build
         """
     def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
-    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j\$(( \$(nproc) / 4 )) ${config_targets}")
+    // reduce build parallelism because clang uses too much memory when compiling
+    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j\$(( \$(nproc) / 5 )) ${config_targets}")
     def execute_cmd = conf.get("execute_cmd", "")
 
     def cmd = conf.get("cmd", """

From 7fc8f02760dea0e92409b6eb190f8af5bcb3d43e Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Sat, 5 Mar 2022 05:15:28 +0000
Subject: [PATCH 5/5] fix test: conv2d test defaults and usage text; move conv_util.cpp into conv_util/

---
 test/conv2d_bwd_data/conv2d_bwd_data.cpp | 22 ++++++++++++++--------
 test/conv2d_fwd/conv2d_fwd.cpp           |  7 +++----
 test/{ => conv_util}/conv_util.cpp       |  0
 3 files changed, 17 insertions(+), 12 deletions(-)
 rename test/{ => conv_util}/conv_util.cpp (100%)

diff --git a/test/conv2d_bwd_data/conv2d_bwd_data.cpp b/test/conv2d_bwd_data/conv2d_bwd_data.cpp
index 72ed6ee0743..0d265963963 100644
--- a/test/conv2d_bwd_data/conv2d_bwd_data.cpp
+++ b/test/conv2d_bwd_data/conv2d_bwd_data.cpp
@@ -11,8 +11,9 @@
 
 using F16  = ck::half_t;
 using F32  = float;
-using BF16 = ushort;
+using BF16 = ck::bhalf_t;
 using INT8 = int8_t;
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -22,6 +23,7 @@ using DeviceConvBwdDataNoOpPtr =
     DeviceConvBwdDataPtr<ck::tensor_operation::element_wise::PassThrough,
                          ck::tensor_operation::element_wise::PassThrough,
                          ck::tensor_operation::element_wise::PassThrough>;
+
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
     std::vector<DeviceConvBwdDataNoOpPtr>&);
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
@@ -30,6 +32,7 @@ void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
     std::vector<DeviceConvBwdDataNoOpPtr>&);
 void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
     std::vector<DeviceConvBwdDataNoOpPtr>&);
+
 } // namespace device_conv2d_bwd_data_instance
 } // namespace device
 } // namespace tensor_operation
@@ -78,7 +81,12 @@ int main(int argc, char* argv[])
     ck::index_t in_right_pad_h  = 1;
     ck::index_t in_right_pad_w  = 1;
 
-    if(argc == 3)
+    if(argc == 1)
+    {
+        data_type   = 1;
+        init_method = 1;
+    }
+    else if(argc == 3)
     {
         data_type   = std::stoi(argv[1]);
         init_method = std::stoi(argv[2]);
@@ -106,11 +114,9 @@ int main(int argc, char* argv[])
     }
     else
     {
-        printf("arg1: data type (0=fp32 )\n");
-        printf("arg2: verification (0=no, 1=yes)\n");
-        printf("arg3: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg4: run kernel # of times (>1)\n");
-        printf("arg5 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+        printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
     }
@@ -296,7 +302,7 @@ int main(int argc, char* argv[])
 
     if(data_type == 0)
     {
-        Run(float(), float(), F32());
+        Run(F32(), F32(), F32());
     }
     else if(data_type == 1)
     {
diff --git a/test/conv2d_fwd/conv2d_fwd.cpp b/test/conv2d_fwd/conv2d_fwd.cpp
index 26f348b21a8..164d4a1cc10 100644
--- a/test/conv2d_fwd/conv2d_fwd.cpp
+++ b/test/conv2d_fwd/conv2d_fwd.cpp
@@ -77,8 +77,8 @@ int main(int argc, char* argv[])
     ck::index_t in_right_pad_w  = 1;
     if(argc == 1)
     {
+        data_type   = 1;
         init_method = 1;
-        data_type   = 0;
     }
     else if(argc == 3)
     {
@@ -108,10 +108,9 @@ int main(int argc, char* argv[])
     }
     else
     {
-        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(1);
     }
diff --git a/test/conv_util.cpp b/test/conv_util/conv_util.cpp
similarity index 100%
rename from test/conv_util.cpp
rename to test/conv_util/conv_util.cpp