From 9dae287ec578d2ca73bd82ffc24f64783c536938 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 28 Jan 2022 20:53:45 +0800 Subject: [PATCH 1/4] add reference --- .../conv2d_fwd_xdl_bias_relu.cpp | 31 ++- .../conv2d_fwd_xdl_bias_relu_add.cpp | 33 +-- example/CMakeLists.txt | 1 + ...fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp | 190 ++++++++++++++++++ ...v2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp | 179 +++++++++++++++++ 5 files changed, 411 insertions(+), 23 deletions(-) create mode 100644 host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp create mode 100644 host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp diff --git a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index aa2605bbdff..7f90cbffeb0 100644 --- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -13,6 +13,7 @@ #include "tensor_layout.hpp" #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" #include "element_wise_operation.hpp" +#include "reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -44,6 +45,10 @@ using DeviceConvFwdInstance = ck::tensor_operation::device:: // | | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | , S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>; + +using ReferenceConvFwdInstance = ck::tensor_operation::host:: + ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + < InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>; // clang-format on template , S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>; + +using ReferenceConvFwdInstance = ck::tensor_operation::host:: + ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + ; // clang-format on template +#include +#include "device.hpp" +#include "device_base.hpp" +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// out[N, Ho, Wo, K] = +// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K] +template +struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + const Tensor& resi_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : in_n_c_hi_wi_{in_n_c_hi_wi}, + wei_k_c_y_x_{wei_k_c_y_x}, + out_n_k_ho_wo_{out_n_k_ho_wo}, + bias_k_{bias_k}, + resi_n_k_ho_wo_{resi_n_k_ho_wo}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& in_n_c_hi_wi_; + const Tensor& wei_k_c_y_x_; + Tensor& out_n_k_ho_wo_; + const Tensor& bias_k_; + const Tensor& resi_n_k_ho_wo_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = + ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K:: + Argument; + + float Run(const Argument& arg) + { + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + float v = 0; + for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - + arg.in_left_pads_[0]; + for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - + arg.in_left_pads_[1]; + if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && + wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + { + v += arg.in_element_op_( + ck::type_convert(arg.in_n_c_hi_wi_(n, c, hi, wi))) * + arg.wei_element_op_( + ck::type_convert(arg.wei_k_c_y_x_(k, c, y, x))); + } + } + } + } + + float v2 = ck::type_convert(arg.out_n_k_ho_wo_(n, k, ho, wo)); + + arg.out_element_op_(v2, + v, + ck::type_convert(arg.bias_k_(k)), + ck::type_convert(arg.resi_n_k_ho_wo_(n, k, ho, wo))); + + arg.out_n_k_ho_wo_(n, k, ho, wo) = ck::type_convert(v2); + }; + + make_ParallelTensorFunctor(f_nchw, + arg.out_n_k_ho_wo_.mDesc.GetLengths()[0], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[1], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[2], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, int) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + const Tensor& resi_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo, + bias_k, + resi_n_k_ho_wo, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << std::endl; + // clang-format on + + return str.str(); + } +}; +} // namespace host +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000000..1daaa54b095 --- /dev/null +++ b/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,179 @@ +#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP +#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP + +#include +#include +#include "device.hpp" +#include "device_base.hpp" +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// out[N, Ho, Wo, K] = +// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) +template +struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : in_n_c_hi_wi_{in_n_c_hi_wi}, + wei_k_c_y_x_{wei_k_c_y_x}, + out_n_k_ho_wo_{out_n_k_ho_wo}, + bias_k_{bias_k}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& in_n_c_hi_wi_; + const Tensor& wei_k_c_y_x_; + Tensor& out_n_k_ho_wo_; + const Tensor& bias_k_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = + ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K:: + Argument; + + float Run(const Argument& arg) + { + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + float v = 0; + for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - + arg.in_left_pads_[0]; + for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - + arg.in_left_pads_[1]; + if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && + wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + { + v += arg.in_element_op_( + ck::type_convert(arg.in_n_c_hi_wi_(n, c, hi, wi))) * + arg.wei_element_op_( + ck::type_convert(arg.wei_k_c_y_x_(k, c, y, x))); + } + } + } + } + + arg.out_n_k_ho_wo_(n, k, ho, wo) = + ck::type_convert(arg.out_element_op_(v, arg.bias_k_(k))); + }; + + make_ParallelTensorFunctor(f_nchw, + arg.out_n_k_ho_wo_.mDesc.GetLengths()[0], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[1], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[2], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, int) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo, + bias_k, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << std::endl; + // clang-format on + + return str.str(); + } +}; +} // namespace host +} // namespace tensor_operation +} // namespace ck +#endif From ee2bb8c1d98616db0943cdaf62fc2affb22bfcdf Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 4 Feb 2022 02:17:13 +0000 Subject: [PATCH 2/4] clean up --- .../conv2d_fwd_xdl_bias_relu.cpp | 49 ---------------- .../conv2d_fwd_xdl_bias_relu_add.cpp | 57 ------------------- 2 files changed, 106 deletions(-) diff --git a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index 7f90cbffeb0..897b213a547 100644 --- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -51,55 +51,6 @@ using ReferenceConvFwdInstance = ck::tensor_operation::host:: < InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>; // clang-format on -template -void host_reference_calculation(const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const Tensor& bias_k, - const std::vector& conv_strides, - const std::vector& conv_dilations, - const std::vector& in_left_pads, - const std::vector& /* in_right_pads */, - const InElementOp& in_element_op, - const WeiElementOp& wei_element_op, - const OutElementOp& out_element_op) -{ - auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { - double v = 0; - for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c) - { - for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y) - { - int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0]; - for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x) - { - int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1]; - if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 && - wi < in_n_c_hi_wi.mDesc.GetLengths()[3]) - { - v += in_element_op(static_cast(in_n_c_hi_wi(n, c, hi, wi))) * - wei_element_op(static_cast(wei_k_c_y_x(k, c, y, x))); - } - } - } - } - - out_n_k_ho_wo(n, k, ho, wo) = out_element_op(v, bias_k(k)); - }; - - make_ParallelTensorFunctor(f_nchw, - out_n_k_ho_wo.mDesc.GetLengths()[0], - out_n_k_ho_wo.mDesc.GetLengths()[1], - out_n_k_ho_wo.mDesc.GetLengths()[2], - out_n_k_ho_wo.mDesc.GetLengths()[3])( - std::thread::hardware_concurrency()); -} - int main(int argc, char* argv[]) { bool do_verification = 0; diff --git a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp index 328be6660fb..6ade3c3d202 100644 --- a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp +++ b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp @@ -48,63 +48,6 @@ using ReferenceConvFwdInstance = ck::tensor_operation::host:: ; // clang-format on -template -void host_reference_calculation(const Tensor& in_n_c_hi_wi, - const Tensor& wei_k_c_y_x, - Tensor& out_n_k_ho_wo, - const Tensor& bias_k, - const Tensor& resi_n_k_ho_wo, - const std::vector& conv_strides, - const std::vector& conv_dilations, - const std::vector& in_left_pads, - const std::vector& /* in_right_pads */, - const InElementOp& in_element_op, - const WeiElementOp& wei_element_op, - const OutElementOp& out_element_op) -{ - auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { - double v = 0; - for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c) - { - for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y) - { - int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0]; - for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x) - { - int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1]; - if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 && - wi < in_n_c_hi_wi.mDesc.GetLengths()[3]) - { - v += in_element_op(static_cast(in_n_c_hi_wi(n, c, hi, wi))) * - wei_element_op(static_cast(wei_k_c_y_x(k, c, y, x))); - } - } - } - } - - double v2 = out_n_k_ho_wo(n, k, ho, wo); - - out_element_op(v2, - v, - static_cast(bias_k(k)), - static_cast(resi_n_k_ho_wo(n, k, ho, wo))); - - out_n_k_ho_wo(n, k, ho, wo) = v2; - }; - - make_ParallelTensorFunctor(f_nchw, - out_n_k_ho_wo.mDesc.GetLengths()[0], - out_n_k_ho_wo.mDesc.GetLengths()[1], - out_n_k_ho_wo.mDesc.GetLengths()[2], - out_n_k_ho_wo.mDesc.GetLengths()[3])( - std::thread::hardware_concurrency()); -} - int main(int argc, char* argv[]) { bool do_verification = 0; From 701c434d189ecb1a352896b12ac94c3c6a7f852d Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 4 Feb 2022 03:20:42 +0000 Subject: [PATCH 3/4] add reference for conv --- example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp | 84 +++------ .../reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp | 172 ++++++++++++++++++ 2 files changed, 197 insertions(+), 59 deletions(-) create mode 100644 host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp diff --git a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp index ad428e2ef23..8c52bdaafdb 100644 --- a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp +++ b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp @@ -11,8 +11,9 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "tensor_layout.hpp" -#include "device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" #include "element_wise_operation.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" +#include "reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -43,55 +44,16 @@ using DeviceConvFwdInstance = ck::tensor_operation::device:: , S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>; // clang-format on -template -void host_verify(const Tensor& in, - const Tensor& wei, - Tensor& out, - const std::vector& conv_strides, - const std::vector& conv_dilations, - const std::vector& in_left_pads, - const std::vector&, - const InElementOp& in_element_op, - const WeiElementOp& wei_element_op, - const OutElementOp& out_element_op) -{ - auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { - double v = 0; - for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) - { - for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) - { - int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0]; - for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) - { - int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1]; - if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && - wi < in.mDesc.GetLengths()[3]) - { - v += in_element_op(static_cast(in(n, c, hi, wi))) * - wei_element_op(static_cast(wei(k, c, y, x))); - } - } - } - } - double v2 = out(n, k, ho, wo); - - out_element_op(v2, v); +using ReferenceConvFwdInstance = + ck::tensor_operation::host::ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + InDataType, + WeiDataType, + OutDataType, + AccDataType, + InElementOp, + WeiElementOp, + OutElementOp>; - out(n, k, ho, wo) = v2; - }; - - make_ParallelTensorFunctor(f_nchw, - out.mDesc.GetLengths()[0], - out.mDesc.GetLengths()[1], - out.mDesc.GetLengths()[2], - out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); -} int main(int argc, char* argv[]) { @@ -265,16 +227,20 @@ int main(int argc, char* argv[]) if(do_verification) { - host_verify(in_n_c_hi_wi, - wei_k_c_y_x, - out_n_k_ho_wo_host_result, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - InElementOp{}, - WeiElementOp{}, - OutElementOp{}); + auto refConv = ReferenceConvFwdInstance{}; + auto refInvoker = refConv.MakeInvoker(); + + auto refArgument = refConv.MakeArgument(in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo_host_result, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + refInvoker.Run(refArgument); out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); diff --git a/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000000..3c7614dbd67 --- /dev/null +++ b/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,172 @@ +#ifndef REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP +#define REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP + +#include +#include +#include "device.hpp" +#include "device_base.hpp" +#include "common_header.hpp" +//#include "tensor_descriptor.hpp" +//#include "tensor_descriptor_helper.hpp" +#include "host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : in_n_c_hi_wi_{in_n_c_hi_wi}, + wei_k_c_y_x_{wei_k_c_y_x}, + out_n_k_ho_wo_{out_n_k_ho_wo}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& in_n_c_hi_wi_; + const Tensor& wei_k_c_y_x_; + Tensor& out_n_k_ho_wo_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = + ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::Argument; + + float Run(const Argument& arg) + { + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + float v = 0; + for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] - + arg.in_left_pads_[0]; + for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] - + arg.in_left_pads_[1]; + if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 && + wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + { + v += arg.in_element_op_( + ck::type_convert(arg.in_n_c_hi_wi_(n, c, hi, wi))) * + arg.wei_element_op_( + ck::type_convert(arg.wei_k_c_y_x_(k, c, y, x))); + } + } + } + } + + arg.out_n_k_ho_wo_(n, k, ho, wo) = + ck::type_convert(arg.out_element_op_(v)); + }; + + make_ParallelTensorFunctor(f_nchw, + arg.out_n_k_ho_wo_.mDesc.GetLengths()[0], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[1], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[2], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, int) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << std::endl; + // clang-format on + + return str.str(); + } +}; +} // namespace host +} // namespace tensor_operation +} // namespace ck +#endif From ad0d5b22350431a09dc10ea0b165474dfef107a5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 4 Feb 2022 04:26:37 +0000 Subject: [PATCH 4/4] rename --- example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp | 63 +++++++++++++------ .../conv2d_fwd_xdl_bias_relu.cpp | 61 ++++++++++++++---- .../conv2d_fwd_xdl_bias_relu_add.cpp | 59 +++++++++++++---- ...c_kyxc_nhwk.hpp => reference_conv_fwd.hpp} | 18 ++---- ...=> reference_conv_fwd_bias_activation.hpp} | 17 ++--- ...eference_conv_fwd_bias_activation_add.hpp} | 17 ++--- 6 files changed, 156 insertions(+), 79 deletions(-) rename host/include/{reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp => reference_conv_fwd.hpp} (91%) rename host/include/{reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp => reference_conv_fwd_bias_activation.hpp} (91%) rename host/include/{reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp => reference_conv_fwd_bias_activation_add.hpp} (91%) diff --git a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp index 8c52bdaafdb..310de70b25f 100644 --- a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp +++ b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp @@ -13,7 +13,7 @@ #include "tensor_layout.hpp" #include "element_wise_operation.hpp" #include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" -#include "reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp" +#include "reference_conv_fwd.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -34,26 +34,53 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; +// clang-format off using DeviceConvFwdInstance = ck::tensor_operation::device:: - DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - // clang-format off -// | InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| -// | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| -// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - , S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>; + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + InDataType, // InDataType + WeiDataType, // WeiDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + InElementOp, // InElementwiseOperation + WeiElementOp, // WeiElementwiseOperation + OutElementOp, // OutElementwiseOperation + ConvFwdDefault, // ConvForwardSpecialization + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceConvFwdInstance = - ck::tensor_operation::host::ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< - InDataType, - WeiDataType, - OutDataType, - AccDataType, - InElementOp, - WeiElementOp, - OutElementOp>; - +using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; int main(int argc, char* argv[]) { diff --git a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp index 897b213a547..79bd332709e 100644 --- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp +++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp @@ -11,9 +11,9 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "tensor_layout.hpp" -#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" #include "element_wise_operation.hpp" -#include "reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp" +#include "reference_conv_fwd_bias_activation.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -38,19 +38,54 @@ static constexpr auto ConvFwdDefault = // clang-format off using DeviceConvFwdInstance = ck::tensor_operation::device:: - DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - // clang-format off -// | InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| -// | | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| -// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - , S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>; - -using ReferenceConvFwdInstance = ck::tensor_operation::host:: - ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - < InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>; + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + InDataType, // InDataType + WeiDataType, // WeiDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + InElementOp, // InElementwiseOperation + WeiElementOp, // WeiElementwiseOperation + OutElementOp, // OutElementwiseOperation + MemorySet, // OutGlobalMemoryDataOperation + ConvFwdDefault, // ConvForwardSpecialization + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on +using ReferenceConvFwdInstance = + ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation; + int main(int argc, char* argv[]) { bool do_verification = 0; diff --git a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp index 6ade3c3d202..2b1414b05b6 100644 --- a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp +++ b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp @@ -11,9 +11,9 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "tensor_layout.hpp" -#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" #include "element_wise_operation.hpp" -#include "reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp" +#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" +#include "reference_conv_fwd_bias_activation_add.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; @@ -36,18 +36,53 @@ static constexpr auto ConvFwdDefault = // clang-format off using DeviceConvFwdInstance = ck::tensor_operation::device:: - DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K -// | InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -// | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| -// | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| -// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - , S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>; - -using ReferenceConvFwdInstance = ck::tensor_operation::host:: - ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - ; + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + InDataType, // InDataType + WeiDataType, // WeiDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + InElementOp, // InElementwiseOperation + WeiElementOp, // WeiElementwiseOperation + OutElementOp, // OutElementwiseOperation + ConvFwdDefault, // ConvForwardSpecialization + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on +using ReferenceConvFwdInstance = + ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add; + int main(int argc, char* argv[]) { bool do_verification = 0; diff --git a/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv_fwd.hpp similarity index 91% rename from host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp rename to host/include/reference_conv_fwd.hpp index 3c7614dbd67..a92ed95b3c5 100644 --- a/host/include/reference_conv2d_fwd_nhwc_kyxc_nhwk.hpp +++ b/host/include/reference_conv_fwd.hpp @@ -1,20 +1,16 @@ -#ifndef REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP -#define REFERENCE_CONV2D_FWD_NHWC_KYXC_NHWK_HPP +#ifndef REFERENCE_CONV_FWD_HPP +#define REFERENCE_CONV_FWD_HPP #include #include -#include "device.hpp" #include "device_base.hpp" -#include "common_header.hpp" -//#include "tensor_descriptor.hpp" -//#include "tensor_descriptor_helper.hpp" #include "host_tensor.hpp" namespace ck { namespace tensor_operation { namespace host { -// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X] template -struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - : public device::BaseOperator +struct ReferenceConvFwd : public device::BaseOperator { // Argument struct Argument : public device::BaseArgument @@ -68,8 +63,7 @@ struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K // Invoker struct Invoker : public device::BaseInvoker { - using Argument = - ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K::Argument; + using Argument = ReferenceConvFwd::Argument; float Run(const Argument& arg) { @@ -159,7 +153,7 @@ struct ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K auto str = std::stringstream(); // clang-format off - str << "ReferenceConv2dFwd_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + str << "ReferenceConvFwd" << std::endl; // clang-format on diff --git a/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv_fwd_bias_activation.hpp similarity index 91% rename from host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp rename to host/include/reference_conv_fwd_bias_activation.hpp index 1daaa54b095..d65bba1a880 100644 --- a/host/include/reference_conv2d_fwd_bias_activation_nhwc_kyxc_nhwk.hpp +++ b/host/include/reference_conv_fwd_bias_activation.hpp @@ -1,13 +1,9 @@ -#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP -#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_NHWC_KYXC_NHWK_HPP +#ifndef REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP +#define REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP #include #include -#include "device.hpp" #include "device_base.hpp" -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" #include "host_tensor.hpp" namespace ck { @@ -23,8 +19,7 @@ template -struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - : public device::BaseOperator +struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator { // Argument struct Argument : public device::BaseArgument @@ -72,9 +67,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ // Invoker struct Invoker : public device::BaseInvoker { - using Argument = - ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K:: - Argument; + using Argument = ReferenceConvFwd_Bias_Activation::Argument; float Run(const Argument& arg) { @@ -166,7 +159,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ auto str = std::stringstream(); // clang-format off - str << "ReferenceConv2dFwd_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + str << "ReferenceConvFwd_Bias_Activation" << std::endl; // clang-format on diff --git a/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp b/host/include/reference_conv_fwd_bias_activation_add.hpp similarity index 91% rename from host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp rename to host/include/reference_conv_fwd_bias_activation_add.hpp index 3ffe369cc40..eb4b708c12a 100644 --- a/host/include/reference_conv2d_fwd_bias_activation_add_nhwc_kyxc_nhwk.hpp +++ b/host/include/reference_conv_fwd_bias_activation_add.hpp @@ -1,13 +1,9 @@ -#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP -#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP +#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP +#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP #include #include -#include "device.hpp" #include "device_base.hpp" -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" #include "host_tensor.hpp" namespace ck { @@ -23,8 +19,7 @@ template -struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K - : public device::BaseOperator +struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator { // Argument struct Argument : public device::BaseArgument @@ -75,9 +70,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Out // Invoker struct Invoker : public device::BaseInvoker { - using Argument = - ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K:: - Argument; + using Argument = ReferenceConvFwd_Bias_Activation_Add::Argument; float Run(const Argument& arg) { @@ -177,7 +170,7 @@ struct ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Out auto str = std::stringstream(); // clang-format off - str << "ReferenceConv2dFwd_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + str << "ReferenceConvFwd_Bias_Activation_Add" << std::endl; // clang-format on