From 9dae287ec578d2ca73bd82ffc24f64783c536938 Mon Sep 17 00:00:00 2001
From: ltqin <letaoqin@amd.com>
Date: Fri, 28 Jan 2022 20:53:45 +0800
Subject: [PATCH 1/4] add reference

---
 .../conv2d_fwd_xdl_bias_relu.cpp              |  31 ++-
 .../conv2d_fwd_xdl_bias_relu_add.cpp          |  33 +--
 example/CMakeLists.txt                        |   1 +
 ...fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp | 190 ++++++++++++++++++
 ...v2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp | 179 +++++++++++++++++
 5 files changed, 411 insertions(+), 23 deletions(-)
 create mode 100644 host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp
 create mode 100644 host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp

diff --git a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index aa2605bbdff..7f90cbffeb0 100644
--- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -13,6 +13,7 @@
 #include "tensor_layout.hpp"
 #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
+#include "reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp"
 
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -44,6 +45,10 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
 //      |          |            |            |            |   Operation|   Operation|     Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
 //      |          |            |            |            |            |            |              |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
         <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,     MemorySet, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+
+using ReferenceConvFwdInstance = ck::tensor_operation::host::
+    ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+        < InDataType,  WeiDataType,  OutDataType,  AccDataType, InElementOp, WeiElementOp, OutElementOp>;
 // clang-format on
 
 template <typename TIn,
@@ -277,17 +282,21 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        host_reference_calculation(in_n_c_hi_wi,
-                                   wei_k_c_y_x,
-                                   out_n_k_ho_wo_host_result,
-                                   bias_k,
-                                   conv_filter_strides,
-                                   conv_filter_dilations,
-                                   input_left_pads,
-                                   input_right_pads,
-                                   InElementOp{},
-                                   WeiElementOp{},
-                                   OutElementOp{});
+        auto refConv    = ReferenceConvFwdInstance{};
+        auto refInvoker = refConv.MakeInvoker();
+
+        auto refArgument = refConv.MakeArgument(in_n_c_hi_wi,
+                                                wei_k_c_y_x,
+                                                out_n_k_ho_wo_host_result,
+                                                bias_k,
+                                                conv_filter_strides,
+                                                conv_filter_dilations,
+                                                input_left_pads,
+                                                input_right_pads,
+                                                InElementOp{},
+                                                WeiElementOp{},
+                                                OutElementOp{});
+        refInvoker.Run(refArgument);
 
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
 
diff --git a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index 1353b65248f..328be6660fb 100644
--- a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -13,6 +13,7 @@
 #include "tensor_layout.hpp"
 #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
+#include "reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -41,6 +42,10 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
 //      |          |            |            |            |   Operation|   Operation|     Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
 //      |          |            |            |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
         <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+
+using ReferenceConvFwdInstance = ck::tensor_operation::host::
+    ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+        <InDataType, WeiDataType,  OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>;
 // clang-format on
 
 template <typename TIn,
@@ -292,18 +297,22 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        host_reference_calculation(in_n_c_hi_wi,
-                                   wei_k_c_y_x,
-                                   out_n_k_ho_wo_host_result,
-                                   bias_k,
-                                   resi_n_k_ho_wo,
-                                   conv_filter_strides,
-                                   conv_filter_dilations,
-                                   input_left_pads,
-                                   input_right_pads,
-                                   InElementOp{},
-                                   WeiElementOp{},
-                                   OutElementOp{});
+        auto refConv    = ReferenceConvFwdInstance{};
+        auto refInvoker = refConv.MakeInvoker();
+
+        auto refArgument = refConv.MakeArgument(in_n_c_hi_wi,
+                                                wei_k_c_y_x,
+                                                out_n_k_ho_wo_host_result,
+                                                bias_k,
+                                                resi_n_k_ho_wo,
+                                                conv_filter_strides,
+                                                conv_filter_dilations,
+                                                input_left_pads,
+                                                input_right_pads,
+                                                InElementOp{},
+                                                WeiElementOp{},
+                                                OutElementOp{});
+        refInvoker.Run(refArgument);
 
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
 
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 6f231bcdf03..c25e78bf295 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -2,6 +2,7 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}
     ${PROJECT_SOURCE_DIR}/host/host_tensor/include
     ${PROJECT_SOURCE_DIR}/host/device/include
+    ${PROJECT_SOURCE_DIR}/host/include
     ${PROJECT_SOURCE_DIR}/device_operation/include
     ${PROJECT_SOURCE_DIR}/composable_kernel/include
     ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
diff --git a/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp
new file mode 100644
index 00000000000..3ffe369cc40
--- /dev/null
+++ b/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,190 @@
+#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
+#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
+
+#include <iostream>
+#include <sstream>
+#include "device.hpp"
+#include "device_base.hpp"
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// out[N, Ho, Wo, K] =
+//     activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K]
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+    : public device::BaseOperator
+{
+    // Argument: references to the tensors plus copies of the conv parameters and element-wise ops
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 Tensor<OutDataType>& out_n_k_ho_wo,
+                 const Tensor<OutDataType>& bias_k,
+                 const Tensor<OutDataType>& resi_n_k_ho_wo,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              bias_k_{bias_k},
+              resi_n_k_ho_wo_{resi_n_k_ho_wo},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+        // Tensors are bound by reference, not copied: they must outlive this Argument.
+        const Tensor<InDataType>& in_n_c_hi_wi_;
+        const Tensor<WeiDataType>& wei_k_c_y_x_;
+        Tensor<OutDataType>& out_n_k_ho_wo_;
+        const Tensor<OutDataType>& bias_k_;
+        const Tensor<OutDataType>& resi_n_k_ho_wo_;
+        // Convolution parameters, copied. Invoker::Run does not read in_right_pads_.
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        std::vector<index_t> in_right_pads_;
+        // Element-wise functors, stored by value.
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+
+    // Invoker: evaluates the reference convolution on the host
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument =
+            ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::
+                Argument;
+        // Writes the result into arg.out_n_k_ho_wo_; the returned value is always 0.
+        float Run(const Argument& arg)
+        {
+            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+                float v = 0;
+                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                {
+                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    {
+                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
+                                 arg.in_left_pads_[0];
+                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        {
+                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
+                                     arg.in_left_pads_[1];
+                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
+                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            {
+                                v += arg.in_element_op_(
+                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
+                                     arg.wei_element_op_(
+                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            }
+                        }
+                    }
+                }
+                // v now holds the sum over (c, y, x); out-of-range (padding) taps were skipped.
+                float v2 = ck::type_convert<float>(arg.out_n_k_ho_wo_(n, k, ho, wo));
+                // v2 is passed first to out_element_op_ and is the value stored afterwards.
+                arg.out_element_op_(v2,
+                                    v,
+                                    ck::type_convert<float>(arg.bias_k_(k)),
+                                    ck::type_convert<float>(arg.resi_n_k_ho_wo_(n, k, ho, wo)));
+
+                arg.out_n_k_ho_wo_(n, k, ho, wo) = ck::type_convert<OutDataType>(v2);
+            };
+            // Hand f_nchw and the four output lengths to the parallel tensor functor.
+            make_ParallelTensorFunctor(f_nchw,
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+        // Type-erased entry point; throws std::bad_cast when *p_arg is not an Argument.
+        float Run(const device::BaseArgument* p_arg, int) override
+        {
+            return Run(dynamic_cast<const Argument&>(*p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check; until then every instantiation is reported valid
+        return true;
+    }
+    // No validation is performed here: every argument is reported as supported.
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+    // Builds an Argument from the given tensors, conv parameters and element-wise ops.
+    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             Tensor<OutDataType>& out_n_k_ho_wo,
+                             const Tensor<OutDataType>& bias_k,
+                             const Tensor<OutDataType>& resi_n_k_ho_wo,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        bias_k,
+                        resi_n_k_ho_wo,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+    // Same Invoker, heap-allocated and returned through the device::BaseInvoker interface.
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    std::string GetTypeString() const override
+    {
+        // The returned name is terminated by a single newline character.
+        std::stringstream name;
+        // clang-format off
+        name << "ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+             << '\n';
+        // clang-format on
+
+        return name.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif
diff --git a/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp
new file mode 100644
index 00000000000..1daaa54b095
--- /dev/null
+++ b/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,179 @@
+#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP
+#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP
+
+#include <iostream>
+#include <sstream>
+#include "device.hpp"
+#include "device_base.hpp"
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// out[N, Ho, Wo, K] =
+//     activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K])
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+    : public device::BaseOperator
+{
+    // Argument: references to the tensors plus copies of the conv parameters and element-wise ops
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 Tensor<OutDataType>& out_n_k_ho_wo,
+                 const Tensor<OutDataType>& bias_k,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              bias_k_{bias_k},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+        // Tensors are bound by reference, not copied: they must outlive this Argument.
+        const Tensor<InDataType>& in_n_c_hi_wi_;
+        const Tensor<WeiDataType>& wei_k_c_y_x_;
+        Tensor<OutDataType>& out_n_k_ho_wo_;
+        const Tensor<OutDataType>& bias_k_;
+        // Convolution parameters, copied. Invoker::Run does not read in_right_pads_.
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        std::vector<index_t> in_right_pads_;
+        // Element-wise functors, stored by value.
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+
+    // Invoker: evaluates the reference convolution on the host
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument =
+            ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::
+                Argument;
+        // Writes the result into arg.out_n_k_ho_wo_; the returned value is always 0.
+        float Run(const Argument& arg)
+        {
+            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+                float v = 0;
+                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                {
+                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    {
+                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
+                                 arg.in_left_pads_[0];
+                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        {
+                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
+                                     arg.in_left_pads_[1];
+                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
+                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            {
+                                v += arg.in_element_op_(
+                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
+                                     arg.wei_element_op_(
+                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            }
+                        }
+                    }
+                }
+                // v now holds the sum over (c, y, x); out-of-range (padding) taps were skipped.
+                arg.out_n_k_ho_wo_(n, k, ho, wo) =
+                    ck::type_convert<OutDataType>(arg.out_element_op_(v, arg.bias_k_(k)));
+            };
+            // Hand f_nchw and the four output lengths to the parallel tensor functor.
+            make_ParallelTensorFunctor(f_nchw,
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+        // Type-erased entry point; throws std::bad_cast when *p_arg is not an Argument.
+        float Run(const device::BaseArgument* p_arg, int) override
+        {
+            return Run(dynamic_cast<const Argument&>(*p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check; until then every instantiation is reported valid
+        return true;
+    }
+    // No validation is performed here: every argument is reported as supported.
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+    // Builds an Argument from the given tensors, conv parameters and element-wise ops.
+    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             Tensor<OutDataType>& out_n_k_ho_wo,
+                             const Tensor<OutDataType>& bias_k,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        bias_k,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+    // Same Invoker, heap-allocated and returned through the device::BaseInvoker interface.
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>();
+    }
+
+    std::string GetTypeString() const override
+    {
+        // The returned name is terminated by a single newline character.
+        std::stringstream name;
+        // clang-format off
+        name << "ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+             << '\n';
+        // clang-format on
+
+        return name.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif

From ee2bb8c1d98616db0943cdaf62fc2affb22bfcdf Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Fri, 4 Feb 2022 02:17:13 +0000
Subject: [PATCH 2/4] clean up

---
 .../conv2d_fwd_xdl_bias_relu.cpp              | 49 ----------------
 .../conv2d_fwd_xdl_bias_relu_add.cpp          | 57 -------------------
 2 files changed, 106 deletions(-)

diff --git a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 7f90cbffeb0..897b213a547 100644
--- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -51,55 +51,6 @@ using ReferenceConvFwdInstance = ck::tensor_operation::host::
         < InDataType,  WeiDataType,  OutDataType,  AccDataType, InElementOp, WeiElementOp, OutElementOp>;
 // clang-format on
 
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
-                                const Tensor<TWei>& wei_k_c_y_x,
-                                Tensor<TOut>& out_n_k_ho_wo,
-                                const Tensor<TOut>& bias_k,
-                                const std::vector<ck::index_t>& conv_strides,
-                                const std::vector<ck::index_t>& conv_dilations,
-                                const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>& /* in_right_pads */,
-                                const InElementOp& in_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
-                    {
-                        v += in_element_op(static_cast<const double>(in_n_c_hi_wi(n, c, hi, wi))) *
-                             wei_element_op(static_cast<const double>(wei_k_c_y_x(k, c, y, x)));
-                    }
-                }
-            }
-        }
-
-        out_n_k_ho_wo(n, k, ho, wo) = out_element_op(v, bias_k(k));
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out_n_k_ho_wo.mDesc.GetLengths()[0],
-                               out_n_k_ho_wo.mDesc.GetLengths()[1],
-                               out_n_k_ho_wo.mDesc.GetLengths()[2],
-                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
-        std::thread::hardware_concurrency());
-}
-
 int main(int argc, char* argv[])
 {
     bool do_verification = 0;
diff --git a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index 328be6660fb..6ade3c3d202 100644
--- a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -48,63 +48,6 @@ using ReferenceConvFwdInstance = ck::tensor_operation::host::
         <InDataType, WeiDataType,  OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>;
 // clang-format on
 
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
-                                const Tensor<TWei>& wei_k_c_y_x,
-                                Tensor<TOut>& out_n_k_ho_wo,
-                                const Tensor<TOut>& bias_k,
-                                const Tensor<TOut>& resi_n_k_ho_wo,
-                                const std::vector<ck::index_t>& conv_strides,
-                                const std::vector<ck::index_t>& conv_dilations,
-                                const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>& /* in_right_pads */,
-                                const InElementOp& in_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
-                    {
-                        v += in_element_op(static_cast<const double>(in_n_c_hi_wi(n, c, hi, wi))) *
-                             wei_element_op(static_cast<const double>(wei_k_c_y_x(k, c, y, x)));
-                    }
-                }
-            }
-        }
-
-        double v2 = out_n_k_ho_wo(n, k, ho, wo);
-
-        out_element_op(v2,
-                       v,
-                       static_cast<const double>(bias_k(k)),
-                       static_cast<const double>(resi_n_k_ho_wo(n, k, ho, wo)));
-
-        out_n_k_ho_wo(n, k, ho, wo) = v2;
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out_n_k_ho_wo.mDesc.GetLengths()[0],
-                               out_n_k_ho_wo.mDesc.GetLengths()[1],
-                               out_n_k_ho_wo.mDesc.GetLengths()[2],
-                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
-        std::thread::hardware_concurrency());
-}
-
 int main(int argc, char* argv[])
 {
     bool do_verification = 0;

From 701c434d189ecb1a352896b12ac94c3c6a7f852d Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Fri, 4 Feb 2022 03:20:42 +0000
Subject: [PATCH 3/4] add reference for conv

---
 example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp   |  84 +++------
 .../reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp   | 172 ++++++++++++++++++
 2 files changed, 197 insertions(+), 59 deletions(-)
 create mode 100644 host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp

diff --git a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
index ad428e2ef23..8c52bdaafdb 100644
--- a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
+++ b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
@@ -11,8 +11,9 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp"
 
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -43,55 +44,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device::
         <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
 // clang-format on
 
-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_verify(const Tensor<TIn>& in,
-                 const Tensor<TWei>& wei,
-                 Tensor<TOut>& out,
-                 const std::vector<ck::index_t>& conv_strides,
-                 const std::vector<ck::index_t>& conv_dilations,
-                 const std::vector<ck::index_t>& in_left_pads,
-                 const std::vector<ck::index_t>&,
-                 const InElementOp& in_element_op,
-                 const WeiElementOp& wei_element_op,
-                 const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
-                    {
-                        v += in_element_op(static_cast<const double>(in(n, c, hi, wi))) *
-                             wei_element_op(static_cast<const double>(wei(k, c, y, x)));
-                    }
-                }
-            }
-        }
-        double v2 = out(n, k, ho, wo);
-
-        out_element_op(v2, v);
+using ReferenceConvFwdInstance =
+    ck::tensor_operation::host::ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        AccDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp>;
 
-        out(n, k, ho, wo) = v2;
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out.mDesc.GetLengths()[0],
-                               out.mDesc.GetLengths()[1],
-                               out.mDesc.GetLengths()[2],
-                               out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-}
 
 int main(int argc, char* argv[])
 {
@@ -265,16 +227,20 @@ int main(int argc, char* argv[])
 
     if(do_verification)
     {
-        host_verify(in_n_c_hi_wi,
-                    wei_k_c_y_x,
-                    out_n_k_ho_wo_host_result,
-                    conv_filter_strides,
-                    conv_filter_dilations,
-                    input_left_pads,
-                    input_right_pads,
-                    InElementOp{},
-                    WeiElementOp{},
-                    OutElementOp{});
+        auto refConv    = ReferenceConvFwdInstance{};
+        auto refInvoker = refConv.MakeInvoker();
+
+        auto refArgument = refConv.MakeArgument(in_n_c_hi_wi,
+                                                wei_k_c_y_x,
+                                                out_n_k_ho_wo_host_result,
+                                                conv_filter_strides,
+                                                conv_filter_dilations,
+                                                input_left_pads,
+                                                input_right_pads,
+                                                InElementOp{},
+                                                WeiElementOp{},
+                                                OutElementOp{});
+        refInvoker.Run(refArgument);
 
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
 
diff --git a/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp
new file mode 100644
index 00000000000..3c7614dbd67
--- /dev/null
+++ b/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,172 @@
+#ifndef REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP
+#define REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP
+
+#include <iostream>
+#include <sstream>
+#include "device.hpp"
+#include "device_base.hpp"
+#include "common_header.hpp"
+//#include "tensor_descriptor.hpp"
+//#include "tensor_descriptor_helper.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+    : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    { // tensors, conv parameters and element-wise ops consumed by Invoker::Run
+        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 Tensor<OutDataType>& out_n_k_ho_wo,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+
+        const Tensor<InDataType>& in_n_c_hi_wi_; // not owned: caller's tensor must outlive this
+        const Tensor<WeiDataType>& wei_k_c_y_x_; // not owned: caller's tensor must outlive this
+        Tensor<OutDataType>& out_n_k_ho_wo_; // not owned; written through by Invoker::Run
+
+        std::vector<index_t> conv_strides_; // [0] scales ho, [1] scales wo in Invoker::Run
+        std::vector<index_t> conv_dilations_; // [0] scales y, [1] scales x in Invoker::Run
+        std::vector<index_t> in_left_pads_; // [0] is subtracted from hi, [1] from wi
+        std::vector<index_t> in_right_pads_; // stored, but not read by Invoker::Run
+
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument =
+            ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { // computes one output element
+                float v = 0; // accumulates in float; AccDataType is not used here
+                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                {
+                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    {
+                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
+                                 arg.in_left_pads_[0]; // range-checked below
+                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        {
+                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
+                                     arg.in_left_pads_[1]; // range-checked below
+                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
+                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            { // taps outside the input are skipped, i.e. act as zero padding
+                                v += arg.in_element_op_(
+                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
+                                     arg.wei_element_op_(
+                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            }
+                        }
+                    }
+                }
+
+                arg.out_n_k_ho_wo_(n, k, ho, wo) =
+                    ck::type_convert<OutDataType>(arg.out_element_op_(v));
+            };
+
+            make_ParallelTensorFunctor(f_nchw, // presumably runs f_nchw per (n, k, ho, wo): confirm
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0; // no timing is measured; always reports 0
+        }
+
+        float Run(const device::BaseArgument* p_arg, int) override
+        { // type-erased entry point; the unnamed int parameter is ignored
+            return Run(dynamic_cast<const Argument&>(*p_arg)); // wrong type: bad_cast, not UB
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             Tensor<OutDataType>& out_n_k_ho_wo,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    { // heap-allocated Invoker returned through the BaseInvoker interface
+        return std::make_unique<Invoker>(); // build in place; no temporary Invoker to copy
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif

From ad0d5b22350431a09dc10ea0b165474dfef107a5 Mon Sep 17 00:00:00 2001
From: Chao Liu <chao.liu2@amd.com>
Date: Fri, 4 Feb 2022 04:26:37 +0000
Subject: [PATCH 4/4] rename

---
 example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp   | 63 +++++++++++++------
 .../conv2d_fwd_xdl_bias_relu.cpp              | 61 ++++++++++++++----
 .../conv2d_fwd_xdl_bias_relu_add.cpp          | 59 +++++++++++++----
 ...c_kyxc_nhwk.hpp => reference_conv_fwd.hpp} | 18 ++----
 ...=> reference_conv_fwd_bias_activation.hpp} | 17 ++---
 ...eference_conv_fwd_bias_activation_add.hpp} | 17 ++---
 6 files changed, 156 insertions(+), 79 deletions(-)
 rename host/include/{reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp => reference_conv_fwd.hpp} (91%)
 rename host/include/{reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp => reference_conv_fwd_bias_activation.hpp} (91%)
 rename host/include/{reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp => reference_conv_fwd_bias_activation_add.hpp} (91%)

diff --git a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
index 8c52bdaafdb..310de70b25f 100644
--- a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
+++ b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
@@ -13,7 +13,7 @@
 #include "tensor_layout.hpp"
 #include "element_wise_operation.hpp"
 #include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
-#include "reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_fwd.hpp"
 
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -34,26 +34,53 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
     ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
 
+// clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+    DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,                       // InDataType
+        WeiDataType,                      // WeiDataType
+        OutDataType,                      // OutDataType
+        AccDataType,                      // AccDataType
+        InElementOp,                      // InElementwiseOperation
+        WeiElementOp,                     // WeiElementwiseOperation
+        OutElementOp,                     // OutElementwiseOperation
+        ConvFwdDefault,                   // ConvForwardSpecialization
+        256,                              // BlockSize
+        128,                              // MPerBlock
+        256,                              // NPerBlock
+        4,                                // K0PerBlock
+        8,                                // K1
+        32,                               // MPerXdl
+        32,                               // NPerXdl
+        2,                                // MXdlPerWave
+        4,                                // NXdlPerWave
+        S<4, 64, 1>,                      // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,                       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                       // ABlockTransferSrcAccessOrder
+        2,                                // ABlockTransferSrcVectorDim
+        8,                                // ABlockTransferSrcScalarPerVector
+        8,                                // ABlockTransferDstScalarPerVector_K1
+        true,                             // ABlockLdsAddExtraM
+        S<4, 64, 1>,                      // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,                       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                       // BBlockTransferSrcAccessOrder
+        2,                                // BBlockTransferSrcVectorDim
+        8,                                // BBlockTransferSrcScalarPerVector
+        8,                                // BBlockTransferDstScalarPerVector_K1
+        true,                             // BBlockLdsAddExtraN
+        1,                                // CShuffleMXdlPerWavePerShuffle
+        1,                                // CShuffleNXdlPerWavePerShuffle
+        S<1, 1, 32, 1, 1, 8>,             // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+        8>;                               // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
-using ReferenceConvFwdInstance =
-    ck::tensor_operation::host::ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
-        InDataType,
-        WeiDataType,
-        OutDataType,
-        AccDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp>;
-
+using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
+                                                                              WeiDataType,
+                                                                              OutDataType,
+                                                                              AccDataType,
+                                                                              InElementOp,
+                                                                              WeiElementOp,
+                                                                              OutElementOp>;
 
 int main(int argc, char* argv[])
 {
diff --git a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
index 897b213a547..79bd332709e 100644
--- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -11,9 +11,9 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
-#include "reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_fwd_bias_activation.hpp"
 
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -38,19 +38,54 @@ static constexpr auto ConvFwdDefault =
 
 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,     MemorySet, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
-
-using ReferenceConvFwdInstance = ck::tensor_operation::host::
-    ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-        < InDataType,  WeiDataType,  OutDataType,  AccDataType, InElementOp, WeiElementOp, OutElementOp>;
+    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,                   // InDataType
+        WeiDataType,                  // WeiDataType
+        OutDataType,                  // OutDataType
+        AccDataType,                  // AccDataType
+        InElementOp,                  // InElementwiseOperation
+        WeiElementOp,                 // WeiElementwiseOperation
+        OutElementOp,                 // OutElementwiseOperation
+        MemorySet,                    // OutGlobalMemoryDataOperation
+        ConvFwdDefault,               // ConvForwardSpecialization
+        256,                          // BlockSize
+        128,                          // MPerBlock
+        256,                          // NPerBlock
+        4,                            // K0PerBlock
+        8,                            // K1
+        32,                           // MPerXdl
+        32,                           // NPerXdl
+        2,                            // MXdlPerWave
+        4,                            // NXdlPerWave
+        S<4, 64, 1>,                  // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,                   // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                   // ABlockTransferSrcAccessOrder
+        2,                            // ABlockTransferSrcVectorDim
+        8,                            // ABlockTransferSrcScalarPerVector
+        8,                            // ABlockTransferDstScalarPerVector_K1
+        true,                         // ABlockLdsAddExtraM
+        S<4, 64, 1>,                  // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,                   // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                   // BBlockTransferSrcAccessOrder
+        2,                            // BBlockTransferSrcVectorDim
+        8,                            // BBlockTransferSrcScalarPerVector
+        8,                            // BBlockTransferDstScalarPerVector_K1
+        true,                         // BBlockLdsAddExtraN
+        1,                            // CShuffleMXdlPerWavePerShuffle
+        1,                            // CShuffleNXdlPerWavePerShuffle
+        S<1, 1, 32, 1, 1, 8>,         // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+        8>;                           // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
+using ReferenceConvFwdInstance =
+    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
+                                                                 WeiDataType,
+                                                                 OutDataType,
+                                                                 AccDataType,
+                                                                 InElementOp,
+                                                                 WeiElementOp,
+                                                                 OutElementOp>;
+
 int main(int argc, char* argv[])
 {
     bool do_verification = 0;
diff --git a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
index 6ade3c3d202..2b1414b05b6 100644
--- a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -11,9 +11,9 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
-#include "reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_fwd_bias_activation_add.hpp"
 
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -36,18 +36,53 @@ static constexpr auto ConvFwdDefault =
 
 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
-   DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K 
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
-
-using ReferenceConvFwdInstance = ck::tensor_operation::host::
-    ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-        <InDataType, WeiDataType,  OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>;
+    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,              // InDataType
+        WeiDataType,             // WeiDataType
+        OutDataType,             // OutDataType
+        AccDataType,             // AccDataType
+        InElementOp,             // InElementwiseOperation
+        WeiElementOp,            // WeiElementwiseOperation
+        OutElementOp,            // OutElementwiseOperation
+        ConvFwdDefault,          // ConvForwardSpecialization
+        256,                     // BlockSize
+        128,                     // MPerBlock
+        256,                     // NPerBlock
+        4,                       // K0PerBlock
+        8,                       // K1
+        32,                      // MPerXdl
+        32,                      // NPerXdl
+        2,                       // MXdlPerWave
+        4,                       // NXdlPerWave
+        S<4, 64, 1>,             // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,              // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,              // ABlockTransferSrcAccessOrder
+        2,                       // ABlockTransferSrcVectorDim
+        8,                       // ABlockTransferSrcScalarPerVector
+        8,                       // ABlockTransferDstScalarPerVector_K1
+        true,                    // ABlockLdsAddExtraM
+        S<4, 64, 1>,             // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,              // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,              // BBlockTransferSrcAccessOrder
+        2,                       // BBlockTransferSrcVectorDim
+        8,                       // BBlockTransferSrcScalarPerVector
+        8,                       // BBlockTransferDstScalarPerVector_K1
+        true,                    // BBlockLdsAddExtraN
+        1,                       // CShuffleMXdlPerWavePerShuffle
+        1,                       // CShuffleNXdlPerWavePerShuffle
+        S<1, 1, 32, 1, 1, 8>,    // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+        8>;                      // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
+using ReferenceConvFwdInstance =
+    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add<InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     AccDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp>;
+
 int main(int argc, char* argv[])
 {
     bool do_verification = 0;
diff --git a/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv_fwd.hpp
similarity index 91%
rename from host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp
rename to host/include/reference_conv_fwd.hpp
index 3c7614dbd67..a92ed95b3c5 100644
--- a/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp
+++ b/host/include/reference_conv_fwd.hpp
@@ -1,20 +1,16 @@
-#ifndef REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP
-#define REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP
+#ifndef REFERENCE_CONV_FWD_HPP
+#define REFERENCE_CONV_FWD_HPP
 
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
 #include "device_base.hpp"
-#include "common_header.hpp"
-//#include "tensor_descriptor.hpp"
-//#include "tensor_descriptor_helper.hpp"
 #include "host_tensor.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace host {
 
-// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
+// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X]
 template <typename InDataType,
           typename WeiDataType,
           typename OutDataType,
@@ -22,8 +18,7 @@ template <typename InDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation>
-struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    : public device::BaseOperator
+struct ReferenceConvFwd : public device::BaseOperator
 {
     // Argument
     struct Argument : public device::BaseArgument
@@ -68,8 +63,7 @@ struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
     // Invoker
     struct Invoker : public device::BaseInvoker
     {
-        using Argument =
-            ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::Argument;
+        using Argument = ReferenceConvFwd::Argument;
 
         float Run(const Argument& arg)
         {
@@ -159,7 +153,7 @@ struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         auto str = std::stringstream();
 
         // clang-format off
-        str << "ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+        str << "ReferenceConvFwd"
             << std::endl;
         // clang-format on
 
diff --git a/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv_fwd_bias_activation.hpp
similarity index 91%
rename from host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp
rename to host/include/reference_conv_fwd_bias_activation.hpp
index 1daaa54b095..d65bba1a880 100644
--- a/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/host/include/reference_conv_fwd_bias_activation.hpp
@@ -1,13 +1,9 @@
-#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP
-#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP
+#ifndef REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
+#define REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
 
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
 #include "device_base.hpp"
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
 #include "host_tensor.hpp"
 
 namespace ck {
@@ -23,8 +19,7 @@ template <typename InDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation>
-struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    : public device::BaseOperator
+struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
 {
     // Argument
     struct Argument : public device::BaseArgument
@@ -72,9 +67,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
     // Invoker
     struct Invoker : public device::BaseInvoker
     {
-        using Argument =
-            ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::
-                Argument;
+        using Argument = ReferenceConvFwd_Bias_Activation::Argument;
 
         float Run(const Argument& arg)
         {
@@ -166,7 +159,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
         auto str = std::stringstream();
 
         // clang-format off
-        str << "ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+        str << "ReferenceConvFwd_Bias_Activation"
             << std::endl;
         // clang-format on
 
diff --git a/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv_fwd_bias_activation_add.hpp
similarity index 91%
rename from host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp
rename to host/include/reference_conv_fwd_bias_activation_add.hpp
index 3ffe369cc40..eb4b708c12a 100644
--- a/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/host/include/reference_conv_fwd_bias_activation_add.hpp
@@ -1,13 +1,9 @@
-#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
-#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP
+#ifndef REFERENCE_CONV_FWD_BIAS_ACTIVATION_ADD_HPP // guard name mirrors the renamed header file
+#define REFERENCE_CONV_FWD_BIAS_ACTIVATION_ADD_HPP
 
 #include <iostream>
 #include <sstream>
-#include "device.hpp"
 #include "device_base.hpp"
-#include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
 #include "host_tensor.hpp"
 
 namespace ck {
@@ -23,8 +19,7 @@ template <typename InDataType,
           typename InElementwiseOperation,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation>
-struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    : public device::BaseOperator
+struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
 {
     // Argument
     struct Argument : public device::BaseArgument
@@ -75,9 +70,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Out
     // Invoker
     struct Invoker : public device::BaseInvoker
     {
-        using Argument =
-            ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::
-                Argument;
+        using Argument = ReferenceConvFwd_Bias_Activation_Add::Argument;
 
         float Run(const Argument& arg)
         {
@@ -177,7 +170,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Out
         auto str = std::stringstream();
 
         // clang-format off
-        str << "ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K"
+        str << "ReferenceConvFwd_Bias_Activation_Add"
             << std::endl;
         // clang-format on