From 59ac6f84cd14e6fbf342b04483d0cf25a3d8e972 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Tue, 17 Aug 2021 15:06:18 +0800
Subject: [PATCH 01/21] start

---
 ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 129 ++++++++
 host/driver_offline/CMakeLists.txt            |   3 +
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 197 ++++++++++++
 .../src/conv_wrw_driver_offline.cpp           | 297 ++++++++++++++++++
 4 files changed, 626 insertions(+)
 create mode 100644 composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
 create mode 100644 host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
 create mode 100644 host/driver_offline/src/conv_wrw_driver_offline.cpp
diff --git a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
new file mode 100644
index 00000000000..6708780e7f5
--- /dev/null
+++ b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,129 @@
+#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP // unique per header
+#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+
+namespace ck {
+
+// Builds the GEMM grid descriptors (wei, in, out) for an NCHW/KCYX/NKHW convolution problem.
+// As written here: GemmM = K, GemmN = N * Ho * Wo, GemmK = C * Y * X, GemmK0 = GemmK / GemmK1.
+// wei and out are re-described as packed tensors from their lengths; in is transformed as given.
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
+    const TensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,
+    const TensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,
+    const TensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+    // tensor extents, read from the descriptor lengths
+    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
+
+    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
+    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
+
+    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
+
+    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
+    // convolution parameters: index 0 is used for H, index 1 for W
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmK1;
+
+    // weight tensor
+    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc =
+        transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmM)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // input tensor
+    const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
+        in_n_c_hi_wi_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    // embed: padded H -> (Y, Ho) with coefficients (dilation, stride); likewise W -> (X, Wo)
+    const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor(
+        in_n_c_hip_wip_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+    // merge (C, Y, X) into one dimension and (N, Ho, Wo) into the other
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc,
+                                    make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                                               make_merge_transform(make_tuple(N, Ho, Wo))),
+                                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc =
+        transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmN)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // output tensor
+    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+        make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      in_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+
+} // namespace ck
+#endif
diff --git a/host/driver_offline/CMakeLists.txt b/host/driver_offline/CMakeLists.txt
index fec11e99afe..8dec70d03f0 100644
--- a/host/driver_offline/CMakeLists.txt
+++ b/host/driver_offline/CMakeLists.txt
@@ -13,9 +13,12 @@ include_directories(BEFORE
 
 set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
 set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
+set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
 
 add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
 add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
+add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
 
 target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
 target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
+target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
new file mode 100644
index 00000000000..520eb65483e
--- /dev/null
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,197 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
+#include "driver_gemm_xdlops_v2r3.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
+    const InLengths& in_n_c_hi_wi_lengths,
+    const WeiLengths& wei_k_c_y_x_lengths,
+    const OutLengths& out_n_k_ho_wo_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_c_hi_wi,
+    const Tensor<TInWei>& wei_k_c_y_x,
+    Tensor<TOut>& out_n_k_ho_wo,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+    // Uploads in/wei/out, runs driver_gemm_xdlops_v2r3 five times and reads `out` back to host.
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+
+    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
+    // out is uploaded as well, so its current host contents seed the device buffer
+    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+    // packed descriptors built from the caller-supplied length tuples
+    const auto in_n_c_hi_wi_desc  = make_naive_tensor_descriptor_packed(in_n_c_hi_wi_lengths);
+    const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
+    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
+    // compile-time tuning parameters handed to the GEMM driver below
+#if 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#endif
+    // map the convolution problem onto GEMM; GemmK1 is passed as a compile-time Number
+    const auto descs =
+        transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
+                                                                          in_n_c_hi_wi_desc,
+                                                                          out_n_k_ho_wo_desc,
+                                                                          conv_strides,
+                                                                          conv_dilations,
+                                                                          in_left_pads,
+                                                                          in_right_pads,
+                                                                          Number<GemmK1>{});
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
+    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+
+    constexpr auto out_m0_m1_m2_n_grid_step_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{}));
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+    // five benchmark rounds; every round passes nrepeat to the driver and prints its result
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_gemm_xdlops_v2r3<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperationEnum_t::Set,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(out_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            GemmK1,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmABlockTransferSrcScalarPerVector_GemmK1,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<0, 2, 1>,
+            Sequence<1, 0, 2>,
+            1,
+            GemmBBlockTransferSrcScalarPerVector_GemmN,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<3, 0, 1, 2, 7, 5, 4, 6>,
+            7,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
+            decltype(out_m0_m1_m2_n_grid_step_hacks),
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
+            false>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+                   static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
+                   static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
+                   wei_gemmk0_gemmm_gemmk1_grid_desc,
+                   in_gemmk0_gemmn_gemmk1_grid_desc,
+                   out_gemmm_gemmn_grid_desc,
+                   wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
+                   in_gemmk0_gemmn_gemmk1_grid_step_hacks,
+                   out_m0_m1_m2_n_grid_step_hacks,
+                   wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
+                   in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
+                   nrepeat);
+        // ave_time is printed as ms below, so flop / 1e9 / ms is reported as TFlop/s
+        float perf = static_cast<float>(calculate_convolution_flops(
+                         in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc)) /
+                     (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
+    }
+
+    // copy result back to host
+    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
+}
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
new file mode 100644
index 00000000000..98d3a382476
--- /dev/null
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -0,0 +1,297 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "conv_common.hpp"
+#include "host_conv.hpp"
+#include "device_tensor.hpp"
+#include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
+
+
+#define USE_MODE 1 // 1: problem sizes are parsed from argv (dynamic mode); 0: compile-time sizes
+#define USE_CONV_WRW_V4R4R2_XDL_NCHW 1 // 1: compile in the V4R4R2XDLNCHW branch of main()
+
+enum ConvBackwardWeightAlgo
+{
+    V4R4R2XDLNCHW, // value 0; main() casts the integer in argv[2] to this enum
+};
+
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+    // Driver: parse the problem from argv, run the selected device algo, optionally verify.
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+
+#if USE_MODE
+    // dynamic mode
+    if(argc != 22)
+    {
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
+        exit(1);
+    }
+
+    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvBackwardWeightAlgo algo    = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
+    const bool do_verification    = std::stoi(argv[3]);
+    const int init_method         = std::stoi(argv[4]);
+    const bool do_log             = std::stoi(argv[5]);
+    const int nrepeat             = std::stoi(argv[6]);
+
+    const index_t N  = std::stoi(argv[7]);
+    const index_t K  = std::stoi(argv[8]);
+    const index_t C  = std::stoi(argv[9]);
+    const index_t Y  = std::stoi(argv[10]);
+    const index_t X  = std::stoi(argv[11]);
+    const index_t Hi = std::stoi(argv[12]);
+    const index_t Wi = std::stoi(argv[13]);
+
+    const index_t conv_stride_h   = std::stoi(argv[14]);
+    const index_t conv_stride_w   = std::stoi(argv[15]);
+    const index_t conv_dilation_h = std::stoi(argv[16]);
+    const index_t conv_dilation_w = std::stoi(argv[17]);
+    const index_t in_left_pad_h   = std::stoi(argv[18]);
+    const index_t in_left_pad_w   = std::stoi(argv[19]);
+    const index_t in_right_pad_h  = std::stoi(argv[20]);
+    const index_t in_right_pad_w  = std::stoi(argv[21]);
+
+    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+#else
+    // static mode
+    if(argc < 7)
+    {
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        exit(1);
+    }
+
+    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvBackwardWeightAlgo algo    = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
+    const bool do_verification    = std::stoi(argv[3]);
+    const int init_method         = std::stoi(argv[4]);
+    const bool do_log             = std::stoi(argv[5]);
+    const int nrepeat             = std::stoi(argv[6]);
+
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 192;
+    constexpr index_t Hi = 71;
+    constexpr index_t Wi = 71;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    const index_t conv_stride_h   = 2;
+    const index_t conv_stride_w   = 2;
+    const index_t conv_dilation_h = 1;
+    const index_t conv_dilation_w = 1;
+    const index_t in_left_pad_h   = 1;
+    const index_t in_left_pad_w   = 1;
+    const index_t in_right_pad_h  = 1;
+    const index_t in_right_pad_w  = 1;
+
+    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+#endif
+
+#if 1
+    using in_data_t  = float;
+    using acc_data_t = float;
+    using out_data_t = float;
+#elif 1
+    using in_data_t  = half_t;
+    using acc_data_t = float;
+    using out_data_t = half_t;
+#elif 1
+    using in_data_t  = int8_t;
+    using acc_data_t = int32_t;
+    using out_data_t = int8_t;
+#endif
+
+    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
+
+    if(layout == ConvTensorLayout::NCHW)
+    {
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(C);
+        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(C);
+        wei_lengths_host[2] = static_cast<std::size_t>(Y);
+        wei_lengths_host[3] = static_cast<std::size_t>(X);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(K);
+        out_lengths_host[2] = static_cast<std::size_t>(Ho);
+        out_lengths_host[3] = static_cast<std::size_t>(Wo);
+    }
+    else if(layout == ConvTensorLayout::NHWC)
+    {
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
+        in_lengths_host[3]  = static_cast<std::size_t>(C);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(Y);
+        wei_lengths_host[2] = static_cast<std::size_t>(X);
+        wei_lengths_host[3] = static_cast<std::size_t>(C);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(Ho);
+        out_lengths_host[2] = static_cast<std::size_t>(Wo);
+        out_lengths_host[3] = static_cast<std::size_t>(K);
+    }
+    else
+    {
+        throw std::runtime_error("wrong! not implemented");
+    }
+    // host-side tensors; out_host is the reference result, out_device receives the GPU result
+    Tensor<in_data_t> in(in_lengths_host);
+    Tensor<in_data_t> wei(wei_lengths_host);
+    Tensor<out_data_t> out_host(out_lengths_host);
+    Tensor<out_data_t> out_device(out_lengths_host);
+
+    std::cout << "layout: " << layout << std::endl;
+    ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
+    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
+    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
+    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
+    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
+    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
+    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+    // init_method picks the generators used to fill in and wei
+    switch(init_method)
+    {
+    case 0:
+        // no initialization
+        break;
+    case 1:
+        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        break;
+    case 2:
+        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        break;
+    case 3:
+        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        break;
+    case 4:
+        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        break;
+    case 5:
+        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+
+        auto gen_wei = [](auto... is) {
+            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+        };
+        wei.GenerateTensorValue(gen_wei, num_thread);
+    }
+    // packs lengths/strides/dilations/pads into one tuple (runtime ints or Number<> constants)
+    auto f_make_for_device_nchw = [&]() {
+#if USE_MODE
+        const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
+        const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
+        const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
+        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
+        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
+        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
+        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
+#else
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+#endif
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+
+#if USE_CONV_WRW_V4R4R2_XDL_NCHW
+    if(algo == ConvBackwardWeightAlgo::V4R4R2XDLNCHW)
+    {
+        if(layout != ConvTensorLayout::NCHW)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nchw();
+
+        device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
+                                                                              acc_data_t,
+                                                                              out_data_t>(
+            tmp[I0],
+            tmp[I1],
+            tmp[I2],
+            tmp[I3],
+            tmp[I4],
+            tmp[I5],
+            tmp[I6],
+            in,
+            wei,
+            out_device,
+            nrepeat);
+    }
+#endif
+
+    if(do_verification)
+    {
+        host_direct_convolution(in,
+                                wei,
+                                out_host,
+                                make_tuple(conv_stride_h, conv_stride_w),
+                                make_tuple(conv_dilation_h, conv_dilation_w),
+                                make_tuple(in_left_pad_h, in_left_pad_w),
+                                make_tuple(in_right_pad_h, in_right_pad_w),
+                                layout);
+
+        check_error(out_host, out_device);
+
+        if(do_log)
+        {
+            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
+        }
+    }
+}

From 0bf754ec7d9e332d21568c03f318861ef6bd1399 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Tue, 17 Aug 2021 15:22:48 +0800
Subject: [PATCH 02/21] modify transformation

---
 ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
index 6708780e7f5..367e39c2cc1 100644
--- a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
@@ -62,23 +62,16 @@ transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
     const auto InRightPadW = in_right_pads[I1];
 
     const auto GemmM  = K;
-    const auto GemmN  = N * Ho * Wo;
-    const auto GemmK  = C * Y * X;
+    const auto GemmN  = C * Y * X;
+    const auto GemmK  = N * Ho * Wo;
     const auto GemmK0 = GemmK / GemmK1;
 
     // weight tensor
-    const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
+    const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
         make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)),
         make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
         make_tuple(Sequence<0>{}, Sequence<1>{}),
-        make_tuple(Sequence<1>{}, Sequence<0>{}));
-
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc =
-        transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc,
-                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
-                                               make_pass_through_transform(GemmM)),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
 
     // input tensor
     const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor(
@@ -104,7 +97,7 @@ transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
                                     make_tuple(make_merge_transform(make_tuple(C, Y, X)),
                                                make_merge_transform(make_tuple(N, Ho, Wo))),
                                     make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
-                                    make_tuple(Sequence<0>{}, Sequence<1>{}));
+                                    make_tuple(Sequence<1>{}, Sequence<0>{}));
 
     const auto in_gemmk0_gemmn_gemmk1_grid_desc =
         transform_tensor_descriptor(in_gemmk_gemmn_grid_desc,
@@ -114,15 +107,22 @@ transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
                                     make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
     // output tensor
-    const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
+    const auto out_gemmk_gemmm_grid_desc = transform_tensor_descriptor(
         make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)),
         make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
         make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-        make_tuple(Sequence<0>{}, Sequence<1>{}));
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc =
+        transform_tensor_descriptor(out_gemmk_gemmm_grid_desc,
+                                    make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                                               make_pass_through_transform(GemmM)),
+                                    make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                    make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
-    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+    return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc,
                       in_gemmk0_gemmn_gemmk1_grid_desc,
-                      out_gemmm_gemmn_grid_desc);
+                      wei_gemmm_gemmn_grid_desc);
 }
 
 } // namespace ck

From 6a963f9bf759308b094b65bcb71c035cb37a2e5d Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Tue, 17 Aug 2021 15:34:57 +0800
Subject: [PATCH 03/21] modify device convolution

---
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index 520eb65483e..dcda7a29e45 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -23,8 +23,8 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     const InLeftPads& in_left_pads,
     const InRightPads& in_right_pads,
     const Tensor<TInWei>& in_n_c_hi_wi,
-    const Tensor<TInWei>& wei_k_c_y_x,
-    Tensor<TOut>& out_n_k_ho_wo,
+    Tensor<TInWei>& wei_k_c_y_x,
+    const Tensor<TOut>& out_n_k_ho_wo,
     ck::index_t nrepeat)
 {
     using namespace ck;
@@ -87,12 +87,12 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
                                                                           in_right_pads,
                                                                           Number<GemmK1>{});
 
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
     const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
-    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
+    const auto wei_gemmm_gemmn_grid_desc         = descs[I2];
 
     // HACK: hacks that control index calculation when iterating over A, B, C matrix
-    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
+    constexpr auto out_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
         make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
         make_tuple(
             Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
@@ -105,7 +105,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
                               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
 
-    constexpr auto out_m0_m1_m2_n_grid_step_hacks =
+    constexpr auto wei_m0_m1_m2_n_grid_step_hacks =
         make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
                               Sequence<0, 0, 1, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
@@ -123,7 +123,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
                               Sequence<0, 0, 0, 0, 0>{},
                               Sequence<0, 0, 2, 0, 0>{}));
 
-    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
+    constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 0, 0, 0>{};
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
@@ -137,9 +137,9 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
             TAcc,
             TOut,
             InMemoryDataOperationEnum_t::Set,
-            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
             decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
-            decltype(out_gemmm_gemmn_grid_desc),
+            decltype(wei_gemmm_gemmn_grid_desc),
             GemmMPerBlock,
             GemmNPerBlock,
             GemmKPerBlock,
@@ -167,21 +167,21 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
             Sequence<3, 0, 1, 2, 7, 5, 4, 6>,
             7,
             GemmCThreadTransferDstScalarPerVector,
-            decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
+            decltype(out_gemmk0_gemmm_gemmk1_grid_step_hacks),
             decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
-            decltype(out_m0_m1_m2_n_grid_step_hacks),
-            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
+            decltype(wei_m0_m1_m2_n_grid_step_hacks),
+            decltype(out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
             decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
-            false>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+            false>(static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
                    static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
-                   static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
-                   wei_gemmk0_gemmm_gemmk1_grid_desc,
+                   static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+                   out_gemmk0_gemmm_gemmk1_grid_desc,
                    in_gemmk0_gemmn_gemmk1_grid_desc,
-                   out_gemmm_gemmn_grid_desc,
-                   wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
+                   wei_gemmm_gemmn_grid_desc,
+                   out_gemmk0_gemmm_gemmk1_grid_step_hacks,
                    in_gemmk0_gemmn_gemmk1_grid_step_hacks,
-                   out_m0_m1_m2_n_grid_step_hacks,
-                   wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
+                   wei_m0_m1_m2_n_grid_step_hacks,
+                   out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
                    in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
                    nrepeat);
 
@@ -193,5 +193,5 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     }
 
     // copy result back to host
-    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
+    wei_k_c_y_x_device_buf.FromDevice(wei_k_c_y_x.mData.data());
 }

From 7e0e97d4901455fa9c3802484f514a628944ba91 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Tue, 17 Aug 2021 15:46:33 +0800
Subject: [PATCH 04/21] modify host

---
 .../src/conv_wrw_driver_offline.cpp           | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 98d3a382476..ae61045cb99 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -164,14 +164,14 @@ int main(int argc, char* argv[])
     }
 
     Tensor<in_data_t> in(in_lengths_host);
-    Tensor<in_data_t> wei(wei_lengths_host);
-    Tensor<out_data_t> out_host(out_lengths_host);
-    Tensor<out_data_t> out_device(out_lengths_host);
+    Tensor<in_data_t> wei_device(wei_lengths_host);
+    Tensor<out_data_t> wei_host(wei_lengths_host);
+    Tensor<out_data_t> out(out_lengths_host);
 
     std::cout << "layout: " << layout << std::endl;
     ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
-    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
-    ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
+    ostream_HostTensorDescriptor(wei_host.mDesc, std::cout << "wei: ");
+    ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
     print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
     print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
@@ -186,31 +186,31 @@ int main(int argc, char* argv[])
         break;
     case 1:
         in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         break;
     case 2:
         in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 3:
         in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         break;
     case 4:
         in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 5:
         in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
         break;
     default:
         in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
 
-        auto gen_wei = [](auto... is) {
+        auto gen_out = [](auto... is) {
             return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
         };
-        wei.GenerateTensorValue(gen_wei, num_thread);
+        out.GenerateTensorValue(gen_out, num_thread);
     }
 
     auto f_make_for_device_nchw = [&]() {
@@ -267,8 +267,8 @@ int main(int argc, char* argv[])
             tmp[I5],
             tmp[I6],
             in,
-            wei,
-            out_device,
+            wei_device,
+            out,
             nrepeat);
     }
 #endif
@@ -276,22 +276,22 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
         host_direct_convolution(in,
-                                wei,
-                                out_host,
+                                wei_host,
+                                out,
                                 make_tuple(conv_stride_h, conv_stride_w),
                                 make_tuple(conv_dilation_h, conv_dilation_w),
                                 make_tuple(in_left_pad_h, in_left_pad_w),
                                 make_tuple(in_right_pad_h, in_right_pad_w),
                                 layout);
 
-        check_error(out_host, out_device);
+        check_error(wei_host, wei_device);
 
         if(do_log)
         {
             LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei_device: ", wei_device.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
         }
     }
 }

From 16d78f5d50997cb5a9bda6c7c72e9a9d4ac915ef Mon Sep 17 00:00:00 2001
From: Jing Zhang <jizhan@amd.com>
Date: Tue, 17 Aug 2021 14:26:15 +0000
Subject: [PATCH 05/21] added host conv bwd and wrw

---
 host/host_tensor/include/conv_common.hpp |   7 +
 host/host_tensor/include/host_conv.hpp   | 242 ++++++++++++++++++-----
 2 files changed, 203 insertions(+), 46 deletions(-)

diff --git a/host/host_tensor/include/conv_common.hpp b/host/host_tensor/include/conv_common.hpp
index 4bf2c234941..5068e42a45a 100644
--- a/host/host_tensor/include/conv_common.hpp
+++ b/host/host_tensor/include/conv_common.hpp
@@ -12,6 +12,13 @@ enum ConvTensorLayout
     NHWCc
 };
 
+enum ConvDirection
+{
+    Forward,
+    BackwardData,
+    BackwardWeights
+};
+
 template <typename... InDesc,
           typename... WeiDesc,
           typename ConvStrides,
diff --git a/host/host_tensor/include/host_conv.hpp b/host/host_tensor/include/host_conv.hpp
index c1228f4832b..c0bbcbd6554 100644
--- a/host/host_tensor/include/host_conv.hpp
+++ b/host/host_tensor/include/host_conv.hpp
@@ -8,83 +8,233 @@ template <typename TIn,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void host_direct_convolution(const Tensor<TIn>& in,
-                             const Tensor<TWei>& wei,
+void host_direct_convolution(Tensor<TIn>& in,
+                             Tensor<TWei>& wei,
                              Tensor<TOut>& out,
                              const ConvStrides& conv_strides,
                              const ConvDilations& conv_dilations,
                              const InLeftPads& in_left_pads,
                              const InRightPads&,
-                             const ConvTensorLayout layout = ConvTensorLayout::NCHW)
+                             const ConvTensorLayout layout = ConvTensorLayout::NCHW,
+                             const ConvDirection dir       = ConvDirection::Forward)
 {
     using namespace ck;
 
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
 
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+    if(dir == ConvDirection::Forward)
+    {
+        auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+            double v = 0;
+            for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
             {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
                 {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
+                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                    for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                     {
-                        v += static_cast<const double>(in(n, c, hi, wi)) *
-                             static_cast<const double>(wei(k, c, y, x));
+                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                        if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                           wi < in.mDesc.GetLengths()[3])
+                        {
+                            v += static_cast<const double>(in(n, c, hi, wi)) *
+                                 static_cast<const double>(wei(k, c, y, x));
+                        }
                     }
                 }
             }
-        }
-        out(n, k, ho, wo) = v;
-    };
+            out(n, k, ho, wo) = v;
+        };
 
-    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
-        double v = 0;
-        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+        auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+            double v = 0;
+            for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
             {
-                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
                 {
-                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[2])
+                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                    for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
                     {
-                        v += static_cast<const double>(in(n, hi, wi, c)) *
-                             static_cast<const double>(wei(k, y, x, c));
+                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                        if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                           wi < in.mDesc.GetLengths()[2])
+                        {
+                            v += static_cast<const double>(in(n, hi, wi, c)) *
+                                 static_cast<const double>(wei(k, y, x, c));
+                        }
                     }
                 }
             }
-        }
-        out(n, ho, wo, k) = v;
-    };
+            out(n, ho, wo, k) = v;
+        };
 
-    if(layout == ConvTensorLayout::NCHW)
+        if(layout == ConvTensorLayout::NCHW)
+        {
+            make_ParallelTensorFunctor(f_nchw,
+                                       out.mDesc.GetLengths()[0],
+                                       out.mDesc.GetLengths()[1],
+                                       out.mDesc.GetLengths()[2],
+                                       out.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+        }
+        else if(layout == ConvTensorLayout::NHWC)
+        {
+            make_ParallelTensorFunctor(f_nhwc,
+                                       out.mDesc.GetLengths()[0],
+                                       out.mDesc.GetLengths()[1],
+                                       out.mDesc.GetLengths()[2],
+                                       out.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+        }
+        else
+        {
+            throw std::runtime_error("wrong! not supported layout");
+        }
+    }
+    else if(dir == ConvDirection::BackwardData)
     {
-        make_ParallelTensorFunctor(f_nchw,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
+            double v = 0;
+            for(int k = 0; k < wei.mDesc.GetLengths()[0]; ++k)
+            {
+                for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+                {
+                    int ho = (hi - y * conv_dilations[I0] + in_left_pads[I0]) / conv_strides[I0];
+                    for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                    {
+                        int wo =
+                            (wi - x * conv_dilations[I1] + in_left_pads[I1]) / conv_strides[I1];
+                        if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                           wi < in.mDesc.GetLengths()[3])
+                        {
+                            v += static_cast<const double>(out(n, k, ho, wo)) *
+                                 static_cast<const double>(wei(k, c, y, x));
+                        }
+                    }
+                }
+            }
+            in(n, c, hi, wi) = v;
+        };
+
+        auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
+            double v = 0;
+            for(int k = 0; k < wei.mDesc.GetLengths()[0]; ++k)
+            {
+                for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+                {
+                    int ho = (hi - y * conv_dilations[I0] + in_left_pads[I0]) / conv_strides[I0];
+                    for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                    {
+                        int wo =
+                            (wi - x * conv_dilations[I1] + in_left_pads[I1]) / conv_strides[I1];
+                        if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                           wi < in.mDesc.GetLengths()[2])
+                        {
+                            v += static_cast<const double>(out(n, ho, wo, k)) *
+                                 static_cast<const double>(wei(k, y, x, c));
+                        }
+                    }
+                }
+            }
+            in(n, hi, wi, c) = v;
+        };
+
+        if(layout == ConvTensorLayout::NCHW)
+        {
+            make_ParallelTensorFunctor(f_nchw,
+                                       in.mDesc.GetLengths()[0],
+                                       in.mDesc.GetLengths()[1],
+                                       in.mDesc.GetLengths()[2],
+                                       in.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+        }
+        else if(layout == ConvTensorLayout::NHWC)
+        {
+            make_ParallelTensorFunctor(f_nhwc,
+                                       in.mDesc.GetLengths()[0],
+                                       in.mDesc.GetLengths()[1],
+                                       in.mDesc.GetLengths()[2],
+                                       in.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+        }
+        else
+        {
+            throw std::runtime_error("wrong! not supported layout");
+        }
     }
-    else if(layout == ConvTensorLayout::NHWC)
+    else if(dir == ConvDirection::BackwardWeights)
     {
-        make_ParallelTensorFunctor(f_nhwc,
-                                   out.mDesc.GetLengths()[0],
-                                   out.mDesc.GetLengths()[1],
-                                   out.mDesc.GetLengths()[2],
-                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
+            double v = 0;
+            for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
+            {
+                for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
+                {
+                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                    for(int wo = 0; wo < wei.mDesc.GetLengths()[3]; ++wo)
+                    {
+                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                        if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                           wi < in.mDesc.GetLengths()[3])
+                        {
+                            v += static_cast<const double>(in(n, c, hi, wi)) *
+                                 static_cast<const double>(out(n, k, ho, wo));
+                        }
+                    }
+                }
+            }
+            wei(k, c, y, x) = v;
+        };
+
+        auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
+            double v = 0;
+            for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
+            {
+                for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
+                {
+                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                    for(int wo = 0; wo < wei.mDesc.GetLengths()[2]; ++wo)
+                    {
+                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                        if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                           wi < in.mDesc.GetLengths()[2])
+                        {
+                            v += static_cast<const double>(in(n, hi, wi, c)) *
+                                 static_cast<const double>(out(n, ho, wo, k));
+                        }
+                    }
+                }
+            }
+            wei(k, y, x, c) = v;
+        };
+
+        if(layout == ConvTensorLayout::NCHW)
+        {
+            make_ParallelTensorFunctor(f_kcyx,
+                                       wei.mDesc.GetLengths()[0],
+                                       wei.mDesc.GetLengths()[1],
+                                       wei.mDesc.GetLengths()[2],
+                                       wei.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+        }
+        else if(layout == ConvTensorLayout::NHWC)
+        {
+            make_ParallelTensorFunctor(f_kyxc,
+                                       wei.mDesc.GetLengths()[0],
+                                       wei.mDesc.GetLengths()[1],
+                                       wei.mDesc.GetLengths()[2],
+                                       wei.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+        }
+        else
+        {
+            throw std::runtime_error("wrong! not supported layout");
+        }
     }
     else
     {
-        throw std::runtime_error("wrong! not supported layout");
+        throw std::runtime_error("wrong! not supported direction");
     }
 }
 

From 9465cf3e69a00253b84eeef4bf17de8e3621aab2 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jizhan@amd.com>
Date: Tue, 17 Aug 2021 15:30:09 +0000
Subject: [PATCH 06/21] remove bwd, separate wrw

---
 .../src/conv_fwd_driver_offline.cpp           |   3 +-
 host/host_tensor/include/conv_common.hpp      |   7 -
 host/host_tensor/include/host_conv.hpp        | 242 ++++--------------
 host/host_tensor/include/host_conv_wrw.hpp    |  89 +++++++
 4 files changed, 137 insertions(+), 204 deletions(-)
 create mode 100644 host/host_tensor/include/host_conv_wrw.hpp

diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp
index 32c33003c5f..2e36f43d964 100644
--- a/host/driver_offline/src/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp
@@ -3,7 +3,7 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include <half.hpp>
+//#include <half.hpp>
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"
@@ -11,6 +11,7 @@
 #include "host_tensor_generator.hpp"
 #include "conv_common.hpp"
 #include "host_conv.hpp"
+#include "host_conv_wrw.hpp"
 #include "device_tensor.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
diff --git a/host/host_tensor/include/conv_common.hpp b/host/host_tensor/include/conv_common.hpp
index 5068e42a45a..4bf2c234941 100644
--- a/host/host_tensor/include/conv_common.hpp
+++ b/host/host_tensor/include/conv_common.hpp
@@ -12,13 +12,6 @@ enum ConvTensorLayout
     NHWCc
 };
 
-enum ConvDirection
-{
-    Forward,
-    BackwardData,
-    BackwardWeights
-};
-
 template <typename... InDesc,
           typename... WeiDesc,
           typename ConvStrides,
diff --git a/host/host_tensor/include/host_conv.hpp b/host/host_tensor/include/host_conv.hpp
index c0bbcbd6554..c1228f4832b 100644
--- a/host/host_tensor/include/host_conv.hpp
+++ b/host/host_tensor/include/host_conv.hpp
@@ -8,233 +8,83 @@ template <typename TIn,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void host_direct_convolution(Tensor<TIn>& in,
-                             Tensor<TWei>& wei,
+void host_direct_convolution(const Tensor<TIn>& in,
+                             const Tensor<TWei>& wei,
                              Tensor<TOut>& out,
                              const ConvStrides& conv_strides,
                              const ConvDilations& conv_dilations,
                              const InLeftPads& in_left_pads,
                              const InRightPads&,
-                             const ConvTensorLayout layout = ConvTensorLayout::NCHW,
-                             const ConvDirection dir       = ConvDirection::Forward)
+                             const ConvTensorLayout layout = ConvTensorLayout::NCHW)
 {
     using namespace ck;
 
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
 
-    if(dir == ConvDirection::Forward)
-    {
-        auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-            double v = 0;
-            for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
-            {
-                for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-                {
-                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                    for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                    {
-                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                        if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                           wi < in.mDesc.GetLengths()[3])
-                        {
-                            v += static_cast<const double>(in(n, c, hi, wi)) *
-                                 static_cast<const double>(wei(k, c, y, x));
-                        }
-                    }
-                }
-            }
-            out(n, k, ho, wo) = v;
-        };
-
-        auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
-            double v = 0;
-            for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
-            {
-                for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
-                {
-                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                    for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
-                    {
-                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                        if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
-                           wi < in.mDesc.GetLengths()[2])
-                        {
-                            v += static_cast<const double>(in(n, hi, wi, c)) *
-                                 static_cast<const double>(wei(k, y, x, c));
-                        }
-                    }
-                }
-            }
-            out(n, ho, wo, k) = v;
-        };
-
-        if(layout == ConvTensorLayout::NCHW)
-        {
-            make_ParallelTensorFunctor(f_nchw,
-                                       out.mDesc.GetLengths()[0],
-                                       out.mDesc.GetLengths()[1],
-                                       out.mDesc.GetLengths()[2],
-                                       out.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-        }
-        else if(layout == ConvTensorLayout::NHWC)
+    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
         {
-            make_ParallelTensorFunctor(f_nhwc,
-                                       out.mDesc.GetLengths()[0],
-                                       out.mDesc.GetLengths()[1],
-                                       out.mDesc.GetLengths()[2],
-                                       out.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-        }
-        else
-        {
-            throw std::runtime_error("wrong! not supported layout");
-        }
-    }
-    else if(dir == ConvDirection::BackwardData)
-    {
-        auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
-            double v = 0;
-            for(int k = 0; k < wei.mDesc.GetLengths()[0]; ++k)
+            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
             {
-                for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
                 {
-                    int ho = (hi - y * conv_dilations[I0] + in_left_pads[I0]) / conv_strides[I0];
-                    for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
                     {
-                        int wo =
-                            (wi - x * conv_dilations[I1] + in_left_pads[I1]) / conv_strides[I1];
-                        if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                           wi < in.mDesc.GetLengths()[3])
-                        {
-                            v += static_cast<const double>(out(n, k, ho, wo)) *
-                                 static_cast<const double>(wei(k, c, y, x));
-                        }
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(wei(k, c, y, x));
                     }
                 }
             }
-            in(n, c, hi, wi) = v;
-        };
+        }
+        out(n, k, ho, wo) = v;
+    };
 
-        auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
-            double v = 0;
-            for(int k = 0; k < wei.mDesc.GetLengths()[0]; ++k)
+    auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {
+        double v = 0;
+        for(int c = 0; c < wei.mDesc.GetLengths()[3]; ++c)
+        {
+            for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
             {
-                for(int y = 0; y < wei.mDesc.GetLengths()[1]; ++y)
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
                 {
-                    int ho = (hi - y * conv_dilations[I0] + in_left_pads[I0]) / conv_strides[I0];
-                    for(int x = 0; x < wei.mDesc.GetLengths()[2]; ++x)
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
                     {
-                        int wo =
-                            (wi - x * conv_dilations[I1] + in_left_pads[I1]) / conv_strides[I1];
-                        if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
-                           wi < in.mDesc.GetLengths()[2])
-                        {
-                            v += static_cast<const double>(out(n, ho, wo, k)) *
-                                 static_cast<const double>(wei(k, y, x, c));
-                        }
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(wei(k, y, x, c));
                     }
                 }
             }
-            in(n, hi, wi, c) = v;
-        };
-
-        if(layout == ConvTensorLayout::NCHW)
-        {
-            make_ParallelTensorFunctor(f_nchw,
-                                       in.mDesc.GetLengths()[0],
-                                       in.mDesc.GetLengths()[1],
-                                       in.mDesc.GetLengths()[2],
-                                       in.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-        }
-        else if(layout == ConvTensorLayout::NHWC)
-        {
-            make_ParallelTensorFunctor(f_nhwc,
-                                       in.mDesc.GetLengths()[0],
-                                       in.mDesc.GetLengths()[1],
-                                       in.mDesc.GetLengths()[2],
-                                       in.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-        }
-        else
-        {
-            throw std::runtime_error("wrong! not supported layout");
         }
+        out(n, ho, wo, k) = v;
+    };
+
+    if(layout == ConvTensorLayout::NCHW)
+    {
+        make_ParallelTensorFunctor(f_nchw,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
     }
-    else if(dir == ConvDirection::BackwardWeights)
+    else if(layout == ConvTensorLayout::NHWC)
     {
-        auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
-            double v = 0;
-            for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
-            {
-                for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
-                {
-                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                    for(int wo = 0; wo < wei.mDesc.GetLengths()[3]; ++wo)
-                    {
-                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                        if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                           wi < in.mDesc.GetLengths()[3])
-                        {
-                            v += static_cast<const double>(in(n, c, hi, wi)) *
-                                 static_cast<const double>(out(n, k, ho, wo));
-                        }
-                    }
-                }
-            }
-            wei(k, c, y, x) = v;
-        };
-
-        auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
-            double v = 0;
-            for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
-            {
-                for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
-                {
-                    int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                    for(int wo = 0; wo < wei.mDesc.GetLengths()[2]; ++wo)
-                    {
-                        int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
-                        if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
-                           wi < in.mDesc.GetLengths()[2])
-                        {
-                            v += static_cast<const double>(in(n, hi, wi, c)) *
-                                 static_cast<const double>(out(n, ho, wo, k));
-                        }
-                    }
-                }
-            }
-            wei(k, y, x, c) = v;
-        };
-
-        if(layout == ConvTensorLayout::NCHW)
-        {
-            make_ParallelTensorFunctor(f_kcyx,
-                                       wei.mDesc.GetLengths()[0],
-                                       wei.mDesc.GetLengths()[1],
-                                       wei.mDesc.GetLengths()[2],
-                                       wei.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-        }
-        else if(layout == ConvTensorLayout::NHWC)
-        {
-            make_ParallelTensorFunctor(f_kyxc,
-                                       wei.mDesc.GetLengths()[0],
-                                       wei.mDesc.GetLengths()[1],
-                                       wei.mDesc.GetLengths()[2],
-                                       wei.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-        }
-        else
-        {
-            throw std::runtime_error("wrong! not supported layout");
-        }
+        make_ParallelTensorFunctor(f_nhwc,
+                                   out.mDesc.GetLengths()[0],
+                                   out.mDesc.GetLengths()[1],
+                                   out.mDesc.GetLengths()[2],
+                                   out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
     }
     else
     {
-        throw std::runtime_error("wrong! not supported direction");
+        throw std::runtime_error("wrong! not supported layout");
     }
 }
 
diff --git a/host/host_tensor/include/host_conv_wrw.hpp b/host/host_tensor/include/host_conv_wrw.hpp
new file mode 100644
index 00000000000..0d57301d93d
--- /dev/null
+++ b/host/host_tensor/include/host_conv_wrw.hpp
@@ -0,0 +1,89 @@
+#pragma once
+#include "host_tensor.hpp"
+
+template <typename TIn,
+          typename TOut,
+          typename TWei,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_backward_weights(
+    const Tensor<TIn>& in,
+    const Tensor<TOut>& out,
+    Tensor<TWei>& wei,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads&,
+    const ConvTensorLayout layout = ConvTensorLayout::NCHW)
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    auto f_kcyx       = [&](auto k, auto c, auto y, auto x) {
+        double v = 0;
+        for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
+        {
+            for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int wo = 0; wo < wei.mDesc.GetLengths()[3]; ++wo)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[3])
+                    {
+                        v += static_cast<const double>(in(n, c, hi, wi)) *
+                             static_cast<const double>(out(n, k, ho, wo));
+                    }
+                }
+            }
+        }
+        wei(k, c, y, x) = v;
+    };
+
+    auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
+        double v = 0;
+        for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
+        {
+            for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
+            {
+                int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
+                for(int wo = 0; wo < wei.mDesc.GetLengths()[2]; ++wo)
+                {
+                    int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
+                    if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
+                       wi < in.mDesc.GetLengths()[2])
+                    {
+                        v += static_cast<const double>(in(n, hi, wi, c)) *
+                             static_cast<const double>(out(n, ho, wo, k));
+                    }
+                }
+            }
+        }
+        wei(k, y, x, c) = v;
+    };
+
+    if(layout == ConvTensorLayout::NCHW)
+    {
+        make_ParallelTensorFunctor(f_kcyx,
+                                   wei.mDesc.GetLengths()[0],
+                                   wei.mDesc.GetLengths()[1],
+                                   wei.mDesc.GetLengths()[2],
+                                   wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+    }
+    else if(layout == ConvTensorLayout::NHWC)
+    {
+        make_ParallelTensorFunctor(f_kyxc,
+                                   wei.mDesc.GetLengths()[0],
+                                   wei.mDesc.GetLengths()[1],
+                                   wei.mDesc.GetLengths()[2],
+                                   wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+    }
+    else
+    {
+        throw std::runtime_error("wrong! not supported layout");
+    }
+}

From 4910398713d9bea68ec3bf055a8f64cd845064cc Mon Sep 17 00:00:00 2001
From: Jing Zhang <jizhan@amd.com>
Date: Tue, 17 Aug 2021 15:30:38 +0000
Subject: [PATCH 07/21] clean

---
 host/driver_offline/src/conv_fwd_driver_offline.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp
index 2e36f43d964..32c33003c5f 100644
--- a/host/driver_offline/src/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp
@@ -3,7 +3,7 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-//#include <half.hpp>
+#include <half.hpp>
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"
@@ -11,7 +11,6 @@
 #include "host_tensor_generator.hpp"
 #include "conv_common.hpp"
 #include "host_conv.hpp"
-#include "host_conv_wrw.hpp"
 #include "device_tensor.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"

From fe26049e8e63a6ef2816f6c8584c1578f1ef0ac0 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Wed, 18 Aug 2021 12:07:01 +0800
Subject: [PATCH 08/21] hacall k to zero

---
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 26 +++++++++----------
 .../src/conv_wrw_driver_offline.cpp           | 12 ++++-----
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index dcda7a29e45..bdb9533405c 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -98,36 +98,36 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
             Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
 
     constexpr auto wei_m0_m1_m2_n_grid_step_hacks =
         make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0>{}),
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{}),
                    make_tuple(Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0>{}));
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{}));
 
     constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 0, 0, 0>{};
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
-        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
 
     for(index_t i = 0; i < 5; ++i)
     {
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index ae61045cb99..371b19e8478 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -10,7 +10,7 @@
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "conv_common.hpp"
-#include "host_conv.hpp"
+#include "host_conv_wrw.hpp"
 #include "device_tensor.hpp"
 #include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 
@@ -275,9 +275,9 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        host_direct_convolution(in,
+        host_direct_convolution_backward_weights(in,
+                                out, 
                                 wei_host,
-                                out,
                                 make_tuple(conv_stride_h, conv_stride_w),
                                 make_tuple(conv_dilation_h, conv_dilation_w),
                                 make_tuple(in_left_pad_h, in_left_pad_w),
@@ -288,10 +288,10 @@ int main(int argc, char* argv[])
 
         if(do_log)
         {
-            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+         //   LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
+         //   LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
             LogRangeAsType<float>(std::cout << "wei_device: ", wei_device.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
-            LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
+         //   LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
         }
     }
 }

From c27a57d4fe793823df35073b362ffdb1dd4d33ed Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Wed, 18 Aug 2021 15:01:21 +0800
Subject: [PATCH 09/21] out log

---
 host/driver_offline/src/conv_wrw_driver_offline.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 371b19e8478..6771a988db6 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -288,10 +288,10 @@ int main(int argc, char* argv[])
 
         if(do_log)
         {
-         //   LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
-         //   LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "out: ", out.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
             LogRangeAsType<float>(std::cout << "wei_device: ", wei_device.mData, ",") << std::endl;
-         //   LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei_host  : ", wei_host.mData, ",") << std::endl;
         }
     }
 }

From 115c5bfc6351ecbfae6f17a53b131dc65e445339 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jizhan@amd.com>
Date: Wed, 18 Aug 2021 12:40:26 +0000
Subject: [PATCH 10/21] fixed

---
 host/host_tensor/include/host_conv_wrw.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/host/host_tensor/include/host_conv_wrw.hpp b/host/host_tensor/include/host_conv_wrw.hpp
index 0d57301d93d..b5716395b66 100644
--- a/host/host_tensor/include/host_conv_wrw.hpp
+++ b/host/host_tensor/include/host_conv_wrw.hpp
@@ -29,7 +29,7 @@ void host_direct_convolution_backward_weights(
             for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
             {
                 int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                for(int wo = 0; wo < wei.mDesc.GetLengths()[3]; ++wo)
+                for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
                 {
                     int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                     if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
@@ -51,7 +51,7 @@ void host_direct_convolution_backward_weights(
             for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
             {
                 int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
-                for(int wo = 0; wo < wei.mDesc.GetLengths()[2]; ++wo)
+                for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
                 {
                     int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
                     if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&

From fca3500e938385b8ce00989198dcde797f6c45d3 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jizhan@amd.com>
Date: Wed, 18 Aug 2021 12:52:11 +0000
Subject: [PATCH 11/21] fixed

---
 host/host_tensor/include/host_conv_wrw.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/host/host_tensor/include/host_conv_wrw.hpp b/host/host_tensor/include/host_conv_wrw.hpp
index b5716395b66..ed3e8c3042e 100644
--- a/host/host_tensor/include/host_conv_wrw.hpp
+++ b/host/host_tensor/include/host_conv_wrw.hpp
@@ -1,16 +1,16 @@
 #pragma once
 #include "host_tensor.hpp"
 
-template <typename TIn,
-          typename TOut,
+template <typename TOut,
+          typename TIn,
           typename TWei,
           typename ConvStrides,
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
 void host_direct_convolution_backward_weights(
-    const Tensor<TIn>& in,
     const Tensor<TOut>& out,
+    const Tensor<TIn>& in,
     Tensor<TWei>& wei,
     const ConvStrides& conv_strides,
     const ConvDilations& conv_dilations,

From 252d271cc49027613c7941798f49264484baa4b9 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Thu, 19 Aug 2021 09:20:02 +0800
Subject: [PATCH 12/21] change to (out in wei)

---
 host/driver_offline/src/conv_wrw_driver_offline.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 6771a988db6..6c8fe129354 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -275,8 +275,8 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        host_direct_convolution_backward_weights(in,
-                                out, 
+        host_direct_convolution_backward_weights(out, 
+                                in,
                                 wei_host,
                                 make_tuple(conv_stride_h, conv_stride_w),
                                 make_tuple(conv_dilation_h, conv_dilation_w),

From 2d194c52b6833c95a8ea6d94edf264abb1ecd74f Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Thu, 19 Aug 2021 11:44:32 +0800
Subject: [PATCH 13/21] input hack

---
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 24 +++++++++----------
 .../src/conv_wrw_driver_offline.cpp           |  4 ++--
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index bdb9533405c..ae04152c431 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -51,7 +51,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
     constexpr index_t BlockSize = 256;
 
-    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmMPerBlock = 128;
     constexpr index_t GemmNPerBlock = 128;
     constexpr index_t GemmKPerBlock = 4;
 
@@ -59,10 +59,10 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     constexpr index_t GemmNPerWave = 32;
     constexpr index_t GemmK1       = 8;
 
-    constexpr index_t MRepeat = 4;
+    constexpr index_t MRepeat = 2;
     constexpr index_t NRepeat = 2;
 
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
     using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
 
     constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
@@ -98,12 +98,12 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
             Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}));
 
     constexpr auto wei_m0_m1_m2_n_grid_step_hacks =
         make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
@@ -127,7 +127,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
         Sequence<0, 0, 0, 0, 0>{};
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
-        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0>{};
 
     for(index_t i = 0; i < 5; ++i)
     {
@@ -158,9 +158,9 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
             false, // don't move back src coordinate after threadwise copy
             GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
             GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
-            Sequence<0, 2, 1>,
             Sequence<1, 0, 2>,
-            1,
+            Sequence<1, 0, 2>,
+            2,
             GemmBBlockTransferSrcScalarPerVector_GemmN,
             GemmBBlockTransferDstScalarPerVector_GemmK1,
             false, // don't move back src coordinate after threadwise copy
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 6c8fe129354..6e4a7d77315 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -201,8 +201,8 @@ int main(int argc, char* argv[])
         out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 0.01}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 0.01}, num_thread);
         break;
     default:
         in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

From 1e9c511ceedd1d5d928fa71b27e8dfdf5581d3bd Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Thu, 19 Aug 2021 17:25:28 +0800
Subject: [PATCH 14/21] hack to out

---
 ...ight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index ae04152c431..9fbd07d9ec5 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -93,9 +93,13 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
 
     // HACK: hacks that control index calculation when iterating over A, B, C matrix
     constexpr auto out_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
-        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(Sequence<0, 0, 1, 0, 0>{}, 
+                   Sequence<0, 0, 0, 0, 0>{}, 
+                   Sequence<0, 0, 1, 0, 0>{}),
         make_tuple(
-            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+                   Sequence<0, 0, 2, 0, 0>{}, 
+                   Sequence<0, 0, 0, 0, 0>{}, 
+                   Sequence<0, 0, 2, 0, 0>{}));
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
         make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
@@ -124,7 +128,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
                               Sequence<0, 0, 0, 0, 0>{}));
 
     constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
-        Sequence<0, 0, 0, 0, 0>{};
+        Sequence<0, 0, 1, 0, 0>{};
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0>{};

From 5cfd01fd211a527353c3e0df200c6a4be5a2ec13 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Fri, 20 Aug 2021 10:19:06 +0800
Subject: [PATCH 15/21] format

---
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 26 +++++-----
 .../src/conv_wrw_driver_offline.cpp           | 48 +++++++++----------
 2 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index 9fbd07d9ec5..4352d4469d3 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -77,15 +77,15 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
 #endif
 
-    const auto descs =
-        transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
-                                                                          in_n_c_hi_wi_desc,
-                                                                          out_n_k_ho_wo_desc,
-                                                                          conv_strides,
-                                                                          conv_dilations,
-                                                                          in_left_pads,
-                                                                          in_right_pads,
-                                                                          Number<GemmK1>{});
+    const auto descs = transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
+        wei_k_c_y_x_desc,
+        in_n_c_hi_wi_desc,
+        out_n_k_ho_wo_desc,
+        conv_strides,
+        conv_dilations,
+        in_left_pads,
+        in_right_pads,
+        Number<GemmK1>{});
 
     const auto out_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
     const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
@@ -93,13 +93,9 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
 
     // HACK: hacks that control index calculation when iterating over A, B, C matrix
     constexpr auto out_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
-        make_tuple(Sequence<0, 0, 1, 0, 0>{}, 
-                   Sequence<0, 0, 0, 0, 0>{}, 
-                   Sequence<0, 0, 1, 0, 0>{}),
+        make_tuple(Sequence<0, 0, 1, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 1, 0, 0>{}),
         make_tuple(
-                   Sequence<0, 0, 2, 0, 0>{}, 
-                   Sequence<0, 0, 0, 0, 0>{}, 
-                   Sequence<0, 0, 2, 0, 0>{}));
+            Sequence<0, 0, 2, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 2, 0, 0>{}));
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
         make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 6e4a7d77315..753bda31be5 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -14,13 +14,12 @@
 #include "device_tensor.hpp"
 #include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 
-
 #define USE_MODE 1
 #define USE_CONV_WRW_V4R4R2_XDL_NCHW 1
 
 enum ConvBackwardWeightAlgo
 {
-    V4R4R2XDLNCHW, 
+    V4R4R2XDLNCHW,
 };
 
 int main(int argc, char* argv[])
@@ -44,12 +43,12 @@ int main(int argc, char* argv[])
         exit(1);
     }
 
-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvBackwardWeightAlgo algo    = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
-    const bool do_verification    = std::stoi(argv[3]);
-    const int init_method         = std::stoi(argv[4]);
-    const bool do_log             = std::stoi(argv[5]);
-    const int nrepeat             = std::stoi(argv[6]);
+    const ConvTensorLayout layout     = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvBackwardWeightAlgo algo = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
+    const bool do_verification        = std::stoi(argv[3]);
+    const int init_method             = std::stoi(argv[4]);
+    const bool do_log                 = std::stoi(argv[5]);
+    const int nrepeat                 = std::stoi(argv[6]);
 
     const index_t N  = std::stoi(argv[7]);
     const index_t K  = std::stoi(argv[8]);
@@ -81,12 +80,12 @@ int main(int argc, char* argv[])
         exit(1);
     }
 
-    const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
-    const ConvBackwardWeightAlgo algo    = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
-    const bool do_verification    = std::stoi(argv[3]);
-    const int init_method         = std::stoi(argv[4]);
-    const bool do_log             = std::stoi(argv[5]);
-    const int nrepeat             = std::stoi(argv[6]);
+    const ConvTensorLayout layout     = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
+    const ConvBackwardWeightAlgo algo = static_cast<ConvBackwardWeightAlgo>(std::stoi(argv[2]));
+    const bool do_verification        = std::stoi(argv[3]);
+    const int init_method             = std::stoi(argv[4]);
+    const bool do_log                 = std::stoi(argv[5]);
+    const int nrepeat                 = std::stoi(argv[6]);
 
     constexpr index_t N  = 128;
     constexpr index_t C  = 192;
@@ -245,7 +244,6 @@ int main(int argc, char* argv[])
                           in_right_pads_dev);
     };
 
-
 #if USE_CONV_WRW_V4R4R2_XDL_NCHW
     if(algo == ConvBackwardWeightAlgo::V4R4R2XDLNCHW)
     {
@@ -257,8 +255,8 @@ int main(int argc, char* argv[])
         const auto tmp = f_make_for_device_nchw();
 
         device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
-                                                                              acc_data_t,
-                                                                              out_data_t>(
+                                                                                      acc_data_t,
+                                                                                      out_data_t>(
             tmp[I0],
             tmp[I1],
             tmp[I2],
@@ -275,14 +273,14 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        host_direct_convolution_backward_weights(out, 
-                                in,
-                                wei_host,
-                                make_tuple(conv_stride_h, conv_stride_w),
-                                make_tuple(conv_dilation_h, conv_dilation_w),
-                                make_tuple(in_left_pad_h, in_left_pad_w),
-                                make_tuple(in_right_pad_h, in_right_pad_w),
-                                layout);
+        host_direct_convolution_backward_weights(out,
+                                                 in,
+                                                 wei_host,
+                                                 make_tuple(conv_stride_h, conv_stride_w),
+                                                 make_tuple(conv_dilation_h, conv_dilation_w),
+                                                 make_tuple(in_left_pad_h, in_left_pad_w),
+                                                 make_tuple(in_right_pad_h, in_right_pad_w),
+                                                 layout);
 
         check_error(wei_host, wei_device);
 

From e6d9dd20b1194951723c62403c53ffe87dae4e80 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Wed, 25 Aug 2021 15:11:00 +0800
Subject: [PATCH 16/21] fix per review comments

---
 ...olution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp |  4 ++--
 ...licit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp |  2 +-
 .../src/conv_wrw_driver_offline.cpp            | 18 +++++++++---------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
index 367e39c2cc1..8db6bb618e0 100644
--- a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
@@ -8,8 +8,8 @@
 namespace ck {
 
 // GemmM = K
-// GemmN = N * Ho * Wo
-// GemmK = C * Y * X
+// GemmK = N * Ho * Wo
+// GemmN = C * Y * X
 template <typename... Wei,
           typename... In,
           typename... Out,
diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index 4352d4469d3..c08c0ee47bf 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -48,7 +48,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
 
 #if 1
-    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    // [M, N, K0, K1] = [128, 128, 4, 8] for fp16
     constexpr index_t BlockSize = 256;
 
     constexpr index_t GemmMPerBlock = 128;
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 753bda31be5..52303493fc3 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -87,13 +87,13 @@ int main(int argc, char* argv[])
     const bool do_log                 = std::stoi(argv[5]);
     const int nrepeat                 = std::stoi(argv[6]);
 
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 192;
-    constexpr index_t Hi = 71;
-    constexpr index_t Wi = 71;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t N  = Number<128>;
+    constexpr index_t C  = Number<128>;
+    constexpr index_t Hi = Number<14>;
+    constexpr index_t Wi = Number<14>;
+    constexpr index_t K  = Number<256>;
+    constexpr index_t Y  = Number<3>;
+    constexpr index_t X  = Number<3>;
 
     const index_t conv_stride_h   = 2;
     const index_t conv_stride_w   = 2;
@@ -200,8 +200,8 @@ int main(int argc, char* argv[])
         out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 0.01}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 0.01}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<float>{-0.01, 0.01}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_3<float>{-0.01, 0.01}, num_thread);
         break;
     default:
         in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

From a34c34a47e64b33439505562e29bfc62b2ccf420 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Wed, 25 Aug 2021 15:32:12 +0800
Subject: [PATCH 17/21] change wei hacks (wei transform has not been merged)

---
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index c08c0ee47bf..2639c6c858b 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -105,23 +105,23 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
                               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                               Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}));
 
-    constexpr auto wei_m0_m1_m2_n_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{}));
+    constexpr auto wei_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
 
     constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 1, 0, 0>{};
@@ -169,7 +169,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
             GemmCThreadTransferDstScalarPerVector,
             decltype(out_gemmk0_gemmm_gemmk1_grid_step_hacks),
             decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
-            decltype(wei_m0_m1_m2_n_grid_step_hacks),
+            decltype(wei_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks),
             decltype(out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
             decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
             false>(static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
@@ -180,7 +180,7 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
                    wei_gemmm_gemmn_grid_desc,
                    out_gemmk0_gemmm_gemmk1_grid_step_hacks,
                    in_gemmk0_gemmn_gemmk1_grid_step_hacks,
-                   wei_m0_m1_m2_n_grid_step_hacks,
+                   wei_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks,
                    out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
                    in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
                    nrepeat);

From b2ea9aa9ab61c69766cf505a6b2c695cc6592500 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Thu, 26 Aug 2021 18:54:21 +0800
Subject: [PATCH 18/21] fix program once issue

---
 ...ard_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
index 8db6bb618e0..949f044b7dd 100644
--- a/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
@@ -1,5 +1,5 @@
-#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
-#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
 
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"

From 0e08b3b359b3d089e4c1dfaacae5a633e3efdd5c Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Fri, 27 Aug 2021 20:21:53 -0500
Subject: [PATCH 19/21] fix review comment

---
 ...plicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp |  24 +-
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp |  55 ++--
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp |  85 +++--
 ...icit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp | 302 ------------------
 ...icit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp |  32 +-
 .../src/conv_bwd_driver_offline.cpp           |  54 ++--
 .../src/conv_fwd_driver_offline.cpp           |  86 ++---
 .../src/conv_wrw_driver_offline.cpp           |  72 ++---
 ..._conv_wrw.hpp => host_conv_bwd_weight.hpp} |   0
 9 files changed, 200 insertions(+), 510 deletions(-)
 delete mode 100644 host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
 rename host/host_tensor/include/{host_conv_wrw.hpp => host_conv_bwd_weight.hpp} (100%)

diff --git a/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp
index 7196f3c1790..8f494735637 100644
--- a/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp
@@ -208,20 +208,20 @@ void device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
 
     // HACK: hacks that control index calculation when iterating over A, B, C matrix
     constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: gemmk0
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: gemmm
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: gemmk1
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: Gemmk0
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: Gemmm
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: Gemmk1
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: GemmM
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: GemmM
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: GemmK1
 
     constexpr auto out_gemmk0_gemmn_gemmk1_grid_step_hacks = make_tuple(
-        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: gemmk0
-                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},   // 1+: gemmn
-                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: gemmk1
-        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: gemmk0
-                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},   // 1-: gemmn
-                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
+        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: GemmK0
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},   // 1+: GemmN
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: GemmK1
+        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: GemmK0
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},   // 1-: GemmN
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: GemmK1
 
     // clang-format off
     constexpr auto in_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks = make_tuple(
diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index 2639c6c858b..445cf229817 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -92,36 +92,39 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     const auto wei_gemmm_gemmn_grid_desc         = descs[I2];
 
     // HACK: hacks that control index calculation when iterating over A, B, C matrix
-    constexpr auto out_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
-        make_tuple(Sequence<0, 0, 1, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 1, 0, 0>{}),
-        make_tuple(
-            Sequence<0, 0, 2, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 2, 0, 0>{}));
+    constexpr auto out_gemmk0_gemmm_gemmk1_grid_step_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 1, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0>{},   // 1+: GemmM
+                              Sequence<0, 0, 1, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 2, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0>{},   // 1-: GemmM
+                              Sequence<0, 0, 2, 0, 0>{})); // 2-: GemmK1
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}));
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},   // 1+: GemmN
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 1-: GemmN
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})); // 2-: GemmK1
 
     constexpr auto wei_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
 
     constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 1, 0, 0>{};
diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index dc4f5eafb67..d65ecadb4df 100644
--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -47,7 +47,7 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
     const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
     const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
 
-#if 1
+#if 0
     // [M, N, K0, K1] = [128, 128, 4, 8] for fp16
     constexpr index_t BlockSize = 256;
 
@@ -74,6 +74,34 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
 
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16 (M, N, K0 are the per-block tile sizes)
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
     constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
 #endif
 
@@ -92,36 +120,39 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
     const auto out_gemmm_gemmn_grid_desc         = descs[I2];
 
     // HACK: hacks that control index calculation when iterating over A, B, C matrix
-    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
-        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
-        make_tuple(
-            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0>{},   // 1+: GemmM
+                              Sequence<0, 0, 0, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0>{},   // 1-: GemmM
+                              Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1
 
     constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 1+: GemmN
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 1-: GemmN
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); // 2-: GemmK1
 
     constexpr auto out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{}));
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
+                              Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
+                              Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
+                              Sequence<0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
+                              Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
+                              Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
+                              Sequence<0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
 
     constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 0, 0, 0>{};
diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
deleted file mode 100644
index 692751bfb3b..00000000000
--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
+++ /dev/null
@@ -1,302 +0,0 @@
-#include <unistd.h>
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
-#include "driver_gemm_xdlops_v2r3.hpp"
-
-template <typename TInWei,
-          typename TAcc,
-          typename TOut,
-          typename InLengths,
-          typename WeiLengths,
-          typename OutLengths,
-          typename ConvStrides,
-          typename ConvDilations,
-          typename InLeftPads,
-          typename InRightPads>
-void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
-    const InLengths& in_n_hi_wi_c_lengths,
-    const WeiLengths& wei_k_y_x_c_lengths,
-    const OutLengths& out_n_ho_wo_k_lengths,
-    const ConvStrides& conv_strides,
-    const ConvDilations& conv_dilations,
-    const InLeftPads& in_left_pads,
-    const InRightPads& in_right_pads,
-    const Tensor<TInWei>& in_n_hi_wi_c,
-    const Tensor<TInWei>& wei_k_y_x_c,
-    Tensor<TOut>& out_n_ho_wo_k,
-    ck::index_t nrepeat)
-{
-    using namespace ck;
-
-    std::cout << __func__ << std::endl;
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};
-    constexpr auto I8 = Number<8>{};
-
-    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
-    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
-    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
-
-    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
-    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
-    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
-
-    const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
-    const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
-
-#if 1
-    // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
-    constexpr index_t BlockSize = 256;
-
-    constexpr index_t GemmMPerBlock = 256;
-    constexpr index_t GemmNPerBlock = 128;
-    constexpr index_t GemmKPerBlock = 4;
-
-    constexpr index_t GemmMPerWave = 32;
-    constexpr index_t GemmNPerWave = 32;
-    constexpr index_t GemmK1       = 4;
-
-    constexpr index_t MRepeat = 4;
-    constexpr index_t NRepeat = 2;
-
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 4>;
-    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
-    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
-
-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
-    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
-
-    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
-#elif 1
-    // [M, N, K0, K1] = [128, 128, 4, 4] for fp32
-    constexpr index_t BlockSize = 256;
-
-    constexpr index_t GemmMPerBlock = 128;
-    constexpr index_t GemmNPerBlock = 128;
-    constexpr index_t GemmKPerBlock = 4;
-
-    constexpr index_t GemmMPerWave = 32;
-    constexpr index_t GemmNPerWave = 32;
-    constexpr index_t GemmK1       = 4;
-
-    constexpr index_t MRepeat = 2;
-    constexpr index_t NRepeat = 2;
-
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
-    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
-    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
-
-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
-    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
-
-    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
-#elif 0
-    // [M, N, K0, K1] = [256, 256, 4, 8] for fp16
-    constexpr index_t BlockSize = 256;
-
-    constexpr index_t GemmMPerBlock = 256;
-    constexpr index_t GemmNPerBlock = 256;
-    constexpr index_t GemmKPerBlock = 4;
-
-    constexpr index_t GemmMPerWave = 32;
-    constexpr index_t GemmNPerWave = 32;
-    constexpr index_t GemmK1       = 8;
-
-    constexpr index_t MRepeat = 4;
-    constexpr index_t NRepeat = 4;
-
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
-    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
-    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
-
-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
-    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
-
-    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
-#elif 1
-    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
-    constexpr index_t BlockSize = 256;
-
-    constexpr index_t GemmMPerBlock = 256;
-    constexpr index_t GemmNPerBlock = 128;
-    constexpr index_t GemmKPerBlock = 4;
-
-    constexpr index_t GemmMPerWave = 32;
-    constexpr index_t GemmNPerWave = 32;
-    constexpr index_t GemmK1       = 8;
-
-    constexpr index_t MRepeat = 4;
-    constexpr index_t NRepeat = 2;
-
-    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
-    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
-    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
-
-    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
-    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
-
-    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
-#endif
-
-    const auto descs =
-        transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc,
-                                                                          in_n_hi_wi_c_desc,
-                                                                          out_n_ho_wo_k_desc,
-                                                                          conv_strides,
-                                                                          conv_dilations,
-                                                                          in_left_pads,
-                                                                          in_right_pads,
-                                                                          Number<GemmK1>{});
-
-    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
-    const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
-    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
-
-    // HACK: hacks that control index calculation when iterating over A, B, C matrix
-    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_step_hacks = make_tuple(
-        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
-        make_tuple(
-            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
-
-    constexpr auto in_gemmk0_gemmn_gemmk1_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
-
-    constexpr auto out_m0_m1_m2_n_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 1, 0, 0>{}),
-                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 0, 0, 0>{},
-                              Sequence<0, 0, 2, 0, 0>{}));
-
-    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
-        Sequence<0, 0, 0, 0, 0>{};
-
-    constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks =
-        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
-
-    for(index_t i = 0; i < 5; ++i)
-    {
-        float ave_time = driver_gemm_xdlops_v2r3<
-            BlockSize,
-            TInWei,
-            TAcc,
-            TOut,
-            InMemoryDataOperationEnum_t::Set,
-            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
-            decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
-            decltype(out_gemmm_gemmn_grid_desc),
-            GemmMPerBlock,
-            GemmNPerBlock,
-            GemmKPerBlock,
-            GemmMPerWave,
-            GemmNPerWave,
-            MRepeat,
-            NRepeat,
-            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
-            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
-            Sequence<1, 0, 2>,
-            Sequence<1, 0, 2>,
-            2,
-            GemmABlockTransferSrcScalarPerVector_GemmK1,
-            GemmABlockTransferDstScalarPerVector_GemmK1,
-            false, // don't move back src coordinate after threadwise copy
-            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
-            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
-            Sequence<1, 0, 2>,
-            Sequence<1, 0, 2>,
-            2,
-            GemmBBlockTransferSrcScalarPerVector_GemmK1,
-            GemmBBlockTransferDstScalarPerVector_GemmK1,
-            false, // don't move back src coordinate after threadwise copy
-            Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
-            6,
-            GemmCThreadTransferDstScalarPerVector,
-            decltype(wei_gemmk0_gemmm_gemmk1_grid_step_hacks),
-            decltype(in_gemmk0_gemmn_gemmk1_grid_step_hacks),
-            decltype(out_m0_m1_m2_n_grid_step_hacks),
-            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks),
-            decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks),
-            false // CAccessOrderMRepeatNRepeat
-            >(static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
-              static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
-              static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
-              wei_gemmk0_gemmm_gemmk1_grid_desc,
-              in_gemmk0_gemmn_gemmk1_grid_desc,
-              out_gemmm_gemmn_grid_desc,
-              wei_gemmk0_gemmm_gemmk1_grid_step_hacks,
-              in_gemmk0_gemmn_gemmk1_grid_step_hacks,
-              out_m0_m1_m2_n_grid_step_hacks,
-              wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks,
-              in_gemmk0_gemmn_gemmk1_grid_move_slice_window_step_hacks,
-              nrepeat);
-
-        {
-            const auto N = out_n_ho_wo_k_lengths[I0];
-            const auto K = out_n_ho_wo_k_lengths[I3];
-            const auto C = wei_k_y_x_c_lengths[I3];
-
-            const auto Hi = in_n_hi_wi_c_lengths[I1];
-            const auto Wi = in_n_hi_wi_c_lengths[I2];
-
-            const auto Ho = out_n_ho_wo_k_lengths[I1];
-            const auto Wo = out_n_ho_wo_k_lengths[I2];
-
-            const auto Y = wei_k_y_x_c_lengths[I1];
-            const auto X = wei_k_y_x_c_lengths[I2];
-
-            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
-                         (std::size_t(1000) * 1000 * 1000) / ave_time;
-
-            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
-                      << std::endl;
-        }
-    }
-
-    // copy result back to host
-    out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
-}
diff --git a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
index 5ff8dfb665a..52432664dea 100644
--- a/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
@@ -250,22 +250,22 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
                               Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1
 
     constexpr auto out_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks =
-        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: MRepeat
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: NRepeat
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: MWaves
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: NWaves
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M0
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M1
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M2
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N1
-                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: MRepeat
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: NRepeat
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: MWaves
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: NWaves
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M0
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M1
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M2
-                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N1
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: M0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: N0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2+: M1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3+: N1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4+: M2
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5+: M3
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6+: M4
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 7+: N2
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: M0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: N0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: M1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: N1
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M2
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M3
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M4
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N2
 
     constexpr auto in_gemmk0_gemmm_gemmk1_grid_move_slice_window_step_hacks =
         Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
diff --git a/host/driver_offline/src/conv_bwd_driver_offline.cpp b/host/driver_offline/src/conv_bwd_driver_offline.cpp
index 67cea948130..4e93ada8590 100644
--- a/host/driver_offline/src/conv_bwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_bwd_driver_offline.cpp
@@ -41,7 +41,7 @@ int main(int argc, char* argv[])
     // dynamic mode
     if(argc != 22)
     {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
         printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
         exit(1);
     }
@@ -79,7 +79,7 @@ int main(int argc, char* argv[])
     // static mode
     if(argc < 7)
     {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
         exit(1);
     }
 
@@ -90,28 +90,28 @@ int main(int argc, char* argv[])
     const bool do_log               = std::stoi(argv[5]);
     const int nrepeat               = std::stoi(argv[6]);
 
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 192;
-    constexpr index_t Hi = 71;
-    constexpr index_t Wi = 71;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-
-    const index_t conv_stride_h   = 2;
-    const index_t conv_stride_w   = 2;
-    const index_t conv_dilation_h = 1;
-    const index_t conv_dilation_w = 1;
-    const index_t in_left_pad_h   = 1;
-    const index_t in_left_pad_w   = 1;
-    const index_t in_right_pad_h  = 1;
-    const index_t in_right_pad_w  = 1;
-
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+    constexpr auto N  = Number<128>{};
+    constexpr auto C  = Number<192>{};
+    constexpr auto Hi = Number<71>{};
+    constexpr auto Wi = Number<71>{};
+    constexpr auto K  = Number<256>{};
+    constexpr auto Y  = Number<3>{};
+    constexpr auto X  = Number<3>{};
+
+    constexpr auto conv_stride_h   = I2;
+    constexpr auto conv_stride_w   = I2;
+    constexpr auto conv_dilation_h = I1;
+    constexpr auto conv_dilation_w = I1;
+    constexpr auto in_left_pad_h   = I1;
+    constexpr auto in_left_pad_w   = I1;
+    constexpr auto in_right_pad_h  = I1;
+    constexpr auto in_right_pad_w  = I1;
+
+    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
+    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
+
+    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
+    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
 #endif
 
 #if 0
@@ -119,9 +119,9 @@ int main(int argc, char* argv[])
     using acc_data_t                 = float;
     using out_data_t                 = float;
 #elif 1
-    using in_data_t  = half_t;
-    using acc_data_t = float;
-    using out_data_t = half_t;
+    using in_data_t   = half_t;
+    using acc_data_t  = float;
+    using out_data_t  = half_t;
 #endif
 
     std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp
index 21acb357320..34d7247f3c8 100644
--- a/host/driver_offline/src/conv_fwd_driver_offline.cpp
+++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp
@@ -19,7 +19,7 @@
 #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
 
-#define USE_MODE 1
+#define USE_DYNAMIC_MODE 1
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
@@ -49,11 +49,11 @@ int main(int argc, char* argv[])
     constexpr auto I5 = Number<5>{};
     constexpr auto I6 = Number<6>{};
 
-#if USE_MODE
+#if USE_DYNAMIC_MODE
     // dynamic mode
     if(argc != 22)
     {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
         printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
         exit(1);
     }
@@ -91,7 +91,7 @@ int main(int argc, char* argv[])
     // static mode
     if(argc < 7)
     {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
         exit(1);
     }
 
@@ -102,28 +102,28 @@ int main(int argc, char* argv[])
     const bool do_log             = std::stoi(argv[5]);
     const int nrepeat             = std::stoi(argv[6]);
 
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 192;
-    constexpr index_t Hi = 71;
-    constexpr index_t Wi = 71;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
-
-    const index_t conv_stride_h   = 2;
-    const index_t conv_stride_w   = 2;
-    const index_t conv_dilation_h = 1;
-    const index_t conv_dilation_w = 1;
-    const index_t in_left_pad_h   = 1;
-    const index_t in_left_pad_w   = 1;
-    const index_t in_right_pad_h  = 1;
-    const index_t in_right_pad_w  = 1;
-
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
-
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+    constexpr auto N  = Number<128>{};
+    constexpr auto C  = Number<192>{};
+    constexpr auto Hi = Number<71>{};
+    constexpr auto Wi = Number<71>{};
+    constexpr auto K  = Number<256>{};
+    constexpr auto Y  = Number<3>{};
+    constexpr auto X  = Number<3>{};
+
+    constexpr auto conv_stride_h   = I2;
+    constexpr auto conv_stride_w   = I2;
+    constexpr auto conv_dilation_h = I1;
+    constexpr auto conv_dilation_w = I1;
+    constexpr auto in_left_pad_h   = I1;
+    constexpr auto in_left_pad_w   = I1;
+    constexpr auto in_right_pad_h  = I1;
+    constexpr auto in_right_pad_w  = I1;
+
+    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
+    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
+
+    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
+    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
 #endif
 
 #if 0
@@ -131,9 +131,9 @@ int main(int argc, char* argv[])
     using acc_data_t = float;
     using out_data_t = float;
 #elif 1
-    using in_data_t  = half_t;
-    using acc_data_t = float;
-    using out_data_t = half_t;
+    using in_data_t   = half_t;
+    using acc_data_t  = float;
+    using out_data_t  = half_t;
 #elif 1
     using in_data_t  = int8_t;
     using acc_data_t = int32_t;
@@ -228,7 +228,6 @@ int main(int argc, char* argv[])
     }
 
     auto f_make_for_device_nchw = [&]() {
-#if USE_MODE
         const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
         const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
         const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
@@ -236,19 +235,6 @@ int main(int argc, char* argv[])
         const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
         const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
         const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-#else
-        const auto in_lengths_dev =
-            make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
-        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
-        const auto out_lengths_dev =
-            make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
-        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
-        const auto conv_dilations_dev =
-            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
-        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
-        const auto in_right_pads_dev =
-            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
-#endif
 
         return make_tuple(in_lengths_dev,
                           wei_lengths_dev,
@@ -260,7 +246,6 @@ int main(int argc, char* argv[])
     };
 
     auto f_make_for_device_nhwc = [&]() {
-#if USE_MODE
         const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
         const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
         const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
@@ -268,19 +253,6 @@ int main(int argc, char* argv[])
         const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
         const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
         const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-#else
-        const auto in_lengths_dev =
-            make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
-        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
-        const auto out_lengths_dev =
-            make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
-        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
-        const auto conv_dilations_dev =
-            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
-        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
-        const auto in_right_pads_dev =
-            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
-#endif
 
         return make_tuple(in_lengths_dev,
                           wei_lengths_dev,
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index 52303493fc3..b83fb9f7543 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -10,11 +10,11 @@
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "conv_common.hpp"
-#include "host_conv_wrw.hpp"
+#include "host_conv_bwd_weight.hpp"
 #include "device_tensor.hpp"
 #include "device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 
-#define USE_MODE 1
+#define USE_DYNAMIC_MODE 1
 #define USE_CONV_WRW_V4R4R2_XDL_NCHW 1
 
 enum ConvBackwardWeightAlgo
@@ -34,11 +34,11 @@ int main(int argc, char* argv[])
     constexpr auto I5 = Number<5>{};
     constexpr auto I6 = Number<6>{};
 
-#if USE_MODE
+#if USE_DYNAMIC_MODE
     // dynamic mode
     if(argc != 22)
     {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
         printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
         exit(1);
     }
@@ -76,7 +76,7 @@ int main(int argc, char* argv[])
     // static mode
     if(argc < 7)
     {
-        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("arg1 to 6: layout, algo, do_verification, init_method, do_log, nrepeat\n");
         exit(1);
     }
 
@@ -87,28 +87,28 @@ int main(int argc, char* argv[])
     const bool do_log                 = std::stoi(argv[5]);
     const int nrepeat                 = std::stoi(argv[6]);
 
-    constexpr index_t N  = Number<128>;
-    constexpr index_t C  = Number<128>;
-    constexpr index_t Hi = Number<14>;
-    constexpr index_t Wi = Number<14>;
-    constexpr index_t K  = Number<256>;
-    constexpr index_t Y  = Number<3>;
-    constexpr index_t X  = Number<3>;
+    constexpr auto N  = Number<128>{};
+    constexpr auto C  = Number<128>{};
+    constexpr auto Hi = Number<14>{};
+    constexpr auto Wi = Number<14>{};
+    constexpr auto K  = Number<256>{};
+    constexpr auto Y  = Number<3>{};
+    constexpr auto X  = Number<3>{};
 
-    const index_t conv_stride_h   = 2;
-    const index_t conv_stride_w   = 2;
-    const index_t conv_dilation_h = 1;
-    const index_t conv_dilation_w = 1;
-    const index_t in_left_pad_h   = 1;
-    const index_t in_left_pad_w   = 1;
-    const index_t in_right_pad_h  = 1;
-    const index_t in_right_pad_w  = 1;
+    constexpr auto conv_stride_h   = I2;
+    constexpr auto conv_stride_w   = I2;
+    constexpr auto conv_dilation_h = I1;
+    constexpr auto conv_dilation_w = I1;
+    constexpr auto in_left_pad_h   = I1;
+    constexpr auto in_left_pad_w   = I1;
+    constexpr auto in_right_pad_h  = I1;
+    constexpr auto in_right_pad_w  = I1;
 
-    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
-    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+    constexpr auto YEff = (Y - I1) * conv_dilation_h + I1;
+    constexpr auto XEff = (X - I1) * conv_dilation_w + I1;
 
-    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+    constexpr auto Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + I1;
+    constexpr auto Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + I1;
 #endif
 
 #if 1
@@ -116,9 +116,9 @@ int main(int argc, char* argv[])
     using acc_data_t = float;
     using out_data_t = float;
 #elif 1
-    using in_data_t  = half_t;
-    using acc_data_t = float;
-    using out_data_t = half_t;
+    using in_data_t   = half_t;
+    using acc_data_t  = float;
+    using out_data_t  = half_t;
 #elif 1
     using in_data_t  = int8_t;
     using acc_data_t = int32_t;
@@ -200,8 +200,8 @@ int main(int argc, char* argv[])
         out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         break;
     case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{-0.01, 0.01}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_3<float>{-0.01, 0.01}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<float>{-0.1, 0.1}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_3<float>{-0.1, 0.1}, num_thread);
         break;
     default:
         in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
@@ -213,7 +213,6 @@ int main(int argc, char* argv[])
     }
 
     auto f_make_for_device_nchw = [&]() {
-#if USE_MODE
         const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
         const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
         const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
@@ -221,19 +220,6 @@ int main(int argc, char* argv[])
         const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
         const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
         const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
-#else
-        const auto in_lengths_dev =
-            make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
-        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
-        const auto out_lengths_dev =
-            make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
-        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
-        const auto conv_dilations_dev =
-            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
-        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
-        const auto in_right_pads_dev =
-            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
-#endif
 
         return make_tuple(in_lengths_dev,
                           wei_lengths_dev,
diff --git a/host/host_tensor/include/host_conv_wrw.hpp b/host/host_tensor/include/host_conv_bwd_weight.hpp
similarity index 100%
rename from host/host_tensor/include/host_conv_wrw.hpp
rename to host/host_tensor/include/host_conv_bwd_weight.hpp

From 7bc4254d76441faad71eddf9edb4d9012c31b1c2 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Tue, 31 Aug 2021 10:58:25 +0800
Subject: [PATCH 20/21] fix vector load issue

---
 ...rd_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index 445cf229817..b681c38eb61 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -64,9 +64,9 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
 
     using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
     using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
-
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
-    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+    // A-matrix transfer uses a vector load of 4, so the config's Ho * Wo must be a multiple of 4
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
 
     using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
     using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;

From fd30df2345f59c9bad176c83b3493d76cba1e18c Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Mon, 30 Aug 2021 22:45:10 -0500
Subject: [PATCH 21/21] tweak

---
 ...icit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp | 28 +++++++++++++++++++
 .../src/conv_wrw_driver_offline.cpp           |  4 +--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
index b681c38eb61..e97bc9c1c7f 100644
--- a/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/host/driver_offline/include/device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -74,6 +74,34 @@ void device_convolution_backward_weight_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
 
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+    // A-matrix transfer uses a vector load of 4, so the config's Ho * Wo must be a multiple of 4
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
     constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
 #endif
 
diff --git a/host/driver_offline/src/conv_wrw_driver_offline.cpp b/host/driver_offline/src/conv_wrw_driver_offline.cpp
index b83fb9f7543..13c73abf30f 100644
--- a/host/driver_offline/src/conv_wrw_driver_offline.cpp
+++ b/host/driver_offline/src/conv_wrw_driver_offline.cpp
@@ -95,8 +95,8 @@ int main(int argc, char* argv[])
     constexpr auto Y  = Number<3>{};
     constexpr auto X  = Number<3>{};
 
-    constexpr auto conv_stride_h   = I2;
-    constexpr auto conv_stride_w   = I2;
+    constexpr auto conv_stride_h   = I1;
+    constexpr auto conv_stride_w   = I1;
     constexpr auto conv_dilation_h = I1;
     constexpr auto conv_dilation_w = I1;
     constexpr auto in_left_pad_h   = I1;