diff --git a/example/13_pool2d_fwd/CMakeLists.txt b/example/13_pool2d_fwd/CMakeLists.txt index 1fdeb4c5858..db09c03321e 100644 --- a/example/13_pool2d_fwd/CMakeLists.txt +++ b/example/13_pool2d_fwd/CMakeLists.txt @@ -1 +1,3 @@ -add_example_executable(example_pool2d_fwd pool2d_fwd.cpp) +add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp) +add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp) + diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md index 2314cfd6701..9b017734e92 100644 --- a/example/13_pool2d_fwd/README.md +++ b/example/13_pool2d_fwd/README.md @@ -1,12 +1,12 @@ -# Instructions for ```example_pool2d_fwd``` Example +# Instructions for ```example_pool2d_fwd``` Examples -## Run ```example_pool2d_fwd``` +## Run ```example_pool2d_fwd_fp16``` ```bash #arg1: verification (0=no, 1=yes) #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) #arg3: time kernel (0=no, 1=yes) #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx -./bin/example_pool2d_fwd 1 1 1 +./bin/example_pool2d_fwd_fp16 1 1 1 ``` Result @@ -18,3 +18,24 @@ Warm up 1 time Start running 10 times... Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s ``` + +## Run ```example_pool2d_fwd_fp32``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx +./bin/example_pool2d_fwd_fp32 1 1 1 +``` + + +Result +``` +./bin/example_pool2d_fwd_fp32 1 1 1 +in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192} +launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s +``` diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp similarity index 76% rename from example/13_pool2d_fwd/pool2d_fwd.cpp rename to example/13_pool2d_fwd/pool2d_fwd_common.hpp index 662a48500f5..8327478fd57 100644 --- a/example/13_pool2d_fwd/pool2d_fwd.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -1,8 +1,4 @@ #include -#include -#include -#include -#include #include "check_err.hpp" #include "config.hpp" @@ -13,44 +9,13 @@ #include "host_reduce_util.hpp" #include "device_tensor.hpp" #include "tensor_layout.hpp" -#include "reduction_operator.hpp" +#include "reduction_enums.hpp" #include "device_pool2d_fwd_nhwc_nhwc.hpp" -using InDataType = ck::half_t; -using OutDataType = ck::half_t; -using AccDataType = float; - -using IndexDataType = int32_t; - -using InLayout = ck::tensor_layout::convolution::NHWC; -using OutLayout = ck::tensor_layout::convolution::NHWC; - -#if 1 -static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; -#else -static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; -#endif - -static constexpr bool OutputIndex = false; -static constexpr bool PropagateNan = false; - -using DevicePoolFwdInstance = - ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C< - InDataType, // InDataType - OutDataType, // OutDataType - AccDataType, // AccDataType - ReduceOpId, - OutputIndex, - 64, // BlockSize - 64, // ReduceMThreadClusterSize - 1, // ReduceKThreadClusterSize - 4, // ReduceMThreadSliceSize - 1, // ReduceKThreadSliceSize - 4>; // InSrcOutDstVectorSize - template @@ -147,68 +112,46 @@ static void pool_host_verify(const Tensor& in, }; } -int main(int argc, char* argv[]) +template +bool pool_test(bool do_verification, + int init_method, + bool time_kernel, + ck::index_t N, + ck::index_t C, + ck::index_t Y, + ck::index_t X, + ck::index_t Hi, + ck::index_t Wi, + ck::index_t window_stride_h, + ck::index_t window_stride_w, + ck::index_t in_left_pad_h, + ck::index_t in_left_pad_w, + ck::index_t in_right_pad_h, + ck::index_t in_right_pad_w) { using namespace ck::host_reduce; - bool do_verification; - int init_method; - bool time_kernel; - - // Pool shape - ck::index_t N = 128; - ck::index_t C = 192; - ck::index_t Y = 3; - ck::index_t X = 3; - ck::index_t Hi = 71; - ck::index_t Wi = 71; - ck::index_t window_stride_h = 2; - ck::index_t window_stride_w = 2; - ck::index_t in_left_pad_h = 1; - ck::index_t in_left_pad_w = 1; - ck::index_t in_right_pad_h = 1; - ck::index_t in_right_pad_w = 1; - - if(argc == 1) - { - do_verification = true; - init_method = 1; - time_kernel = true; - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = static_cast(std::stoi(argv[3])); - } - else if(argc == 16) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = static_cast(std::stoi(argv[3])); - - N = std::stoi(argv[4]); - C = std::stoi(argv[5]); - Y = std::stoi(argv[6]); - X = std::stoi(argv[7]); - Hi = std::stoi(argv[8]); - Wi = std::stoi(argv[9]); - window_stride_h = std::stoi(argv[10]); - window_stride_w = std::stoi(argv[11]); - in_left_pad_h = std::stoi(argv[12]); - in_left_pad_w = std::stoi(argv[13]); - in_right_pad_h = std::stoi(argv[14]); - in_right_pad_w = std::stoi(argv[15]); - } - else - { - printf("arg1: verification (0=no, 1=yes)\n"); - printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); - printf("arg3: time kernel (0=no, 1=yes)\n"); - printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); - exit(0); - } + using DevicePoolFwdInstance = + ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C< + InDataType, // InDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + ReduceOpId, + OutputIndex, + 64, // BlockSize + 64, // ReduceMThreadClusterSize + 1, // ReduceKThreadClusterSize + 4, // ReduceMThreadSliceSize + 1, // ReduceKThreadSliceSize + 4>; // InSrcOutDstVectorSize const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1; const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1; @@ -302,6 +245,7 @@ int main(int argc, char* argv[]) pool_host_verify(in_n_c_hi_wi, @@ -325,5 +269,5 @@ int main(int argc, char* argv[]) }; } - return (pass ? 0 : 1); -} + return (pass); +}; diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp new file mode 100644 index 00000000000..624c8ad6cdd --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include "config.hpp" +#include "tensor_layout.hpp" +#include "reduction_enums.hpp" + +#include "pool2d_fwd_common.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +using IndexDataType = int32_t; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; +#endif + +static constexpr bool OutputIndex = false; +static constexpr bool PropagateNan = false; + +int main(int argc, char* argv[]) +{ + using namespace ck::host_reduce; + + bool do_verification; + int init_method; + bool time_kernel; + + // Pool shape + ck::index_t N = 128; + ck::index_t C = 192; + ck::index_t Y = 3; + ck::index_t X = 3; + ck::index_t Hi = 71; + ck::index_t Wi = 71; + ck::index_t window_stride_h = 2; + ck::index_t window_stride_w = 2; + ck::index_t in_left_pad_h = 1; + ck::index_t in_left_pad_w = 1; + ck::index_t in_right_pad_h = 1; + ck::index_t in_right_pad_w = 1; + + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + } + else if(argc == 16) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + + N = std::stoi(argv[4]); + C = std::stoi(argv[5]); + Y = std::stoi(argv[6]); + X = std::stoi(argv[7]); + Hi = std::stoi(argv[8]); + Wi = std::stoi(argv[9]); + window_stride_h = std::stoi(argv[10]); + window_stride_w = std::stoi(argv[11]); + in_left_pad_h = std::stoi(argv[12]); + in_left_pad_w = std::stoi(argv[13]); + in_right_pad_h = std::stoi(argv[14]); + in_right_pad_w = std::stoi(argv[15]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(0); + } + + bool pass = pool_test(do_verification, + init_method, + time_kernel, + N, + C, + Y, + X, + Hi, + Wi, + window_stride_h, + window_stride_w, + in_left_pad_h, + in_left_pad_w, + in_right_pad_h, + in_right_pad_w); + + return (pass ? 0 : 1); +} diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp new file mode 100644 index 00000000000..d2d2ae05d10 --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include "config.hpp" +#include "tensor_layout.hpp" +#include "reduction_enums.hpp" + +#include "pool2d_fwd_common.hpp" + +using InDataType = float; +using OutDataType = float; +using AccDataType = float; + +using IndexDataType = int32_t; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; +#endif + +static constexpr bool OutputIndex = false; +static constexpr bool PropagateNan = false; + +int main(int argc, char* argv[]) +{ + using namespace ck::host_reduce; + + bool do_verification; + int init_method; + bool time_kernel; + + // Pool shape + ck::index_t N = 128; + ck::index_t C = 192; + ck::index_t Y = 3; + ck::index_t X = 3; + ck::index_t Hi = 71; + ck::index_t Wi = 71; + ck::index_t window_stride_h = 2; + ck::index_t window_stride_w = 2; + ck::index_t in_left_pad_h = 1; + ck::index_t in_left_pad_w = 1; + ck::index_t in_right_pad_h = 1; + ck::index_t in_right_pad_w = 1; + + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + } + else if(argc == 16) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + + N = std::stoi(argv[4]); + C = std::stoi(argv[5]); + Y = std::stoi(argv[6]); + X = std::stoi(argv[7]); + Hi = std::stoi(argv[8]); + Wi = std::stoi(argv[9]); + window_stride_h = std::stoi(argv[10]); + window_stride_w = std::stoi(argv[11]); + in_left_pad_h = std::stoi(argv[12]); + in_left_pad_w = std::stoi(argv[13]); + in_right_pad_h = std::stoi(argv[14]); + in_right_pad_w = std::stoi(argv[15]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(0); + } + + bool pass = pool_test(do_verification, + init_method, + time_kernel, + N, + C, + Y, + X, + Hi, + Wi, + window_stride_h, + window_stride_w, + in_left_pad_h, + in_left_pad_w, + in_right_pad_h, + in_right_pad_w); + + return (pass ? 0 : 1); +} diff --git a/example/21_lnorm/CMakeLists.txt b/example/21_lnorm/CMakeLists.txt new file mode 100644 index 00000000000..1fc7378d794 --- /dev/null +++ b/example/21_lnorm/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_lnorm_use_reduce lnorm_use_reduce.cpp) diff --git a/example/21_lnorm/README.md b/example/21_lnorm/README.md new file mode 100644 index 00000000000..475f575b45f --- /dev/null +++ b/example/21_lnorm/README.md @@ -0,0 +1,19 @@ +# Instructions for ```example_lnorm_use_reduce``` + +## Run ```example_lnorm_use_reduce``` +```bash +# -D : input 4-d tensor lengths (nhwc layout) +# -v : verification (0=no, 1=yes) +#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg2: time kernel (0=no, 1=yes) +./bin/example_lnorm_use_reduce -D 512,28,28,256 -v 1 1 1 +``` + +Result +``` +./bin/example_lnorm_use_reduce -D 512,28,28,256 -v 1 1 0 +``` +root@dc-smc-18:/data/work/composable_kernel/Build3# bin/example_lnorm_use_reduce -D 512,28,28,256 -v 1 1 0 +Perf: 0 ms, inf GB/s, DeviceReduceMultiBlockAtomicAdd<256,M_C16_S1,K_C16_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> + DeviceReduceMultiBlockAtomicAdd<256,M_C16_S1,K_C16_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> + + diff --git a/example/21_lnorm/lnorm_use_reduce.cpp b/example/21_lnorm/lnorm_use_reduce.cpp new file mode 100644 index 00000000000..0c7921cf67d --- /dev/null +++ b/example/21_lnorm/lnorm_use_reduce.cpp @@ -0,0 +1,382 @@ +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "device_tensor.hpp" +#include "device_base.hpp" +#include "device_reduce_multiblock.hpp" +#include "host_common_util.hpp" + +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InDataType = ck::half_t; +using OutDataType = float; +using AccDataType = float; + +// for NHWC layer-norm calculation of mean and meansquare +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +constexpr bool PropagateNan = false; +constexpr bool OutputIndex = false; + +constexpr InMemoryDataOperationEnum OutMemoryDataOperation = InMemoryDataOperationEnum::AtomicAdd; + +using ReduceOperation_Mean = ck::reduce::Add; +using InElementwiseOperation_Mean = + ck::tensor_operation::element_wise::UnaryIdentic; +using AccElementwiseOperation_Mean = + ck::tensor_operation::element_wise::UnaryIdentic; + +using ReduceOperation_Meansquare = ck::reduce::Add; +using InElementwiseOperation_Meansquare = + ck::tensor_operation::element_wise::UnarySquare; +using AccElementwiseOperation_Meansquare = + ck::tensor_operation::element_wise::UnaryIdentic; + +using DeviceReduceInstance_Mean = DeviceReduceMultiBlock; + +using DeviceReduceInstance_Meansquare = DeviceReduceMultiBlock; + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {512, 28, 28, 256}; + size_t n, h, w, c; + + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + public: + SimpleAppArgs() + { + n = inLengths[0]; + h = inLengths[1]; + w = inLengths[2]; + c = inLengths[3]; + }; + + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + if(inLengths.size() != Rank) + throw std::runtime_error( + "Invalid option format! The number of integers is incorrect!"); + + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 2 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + n = inLengths[0]; + h = inLengths[1]; + w = inLengths[2]; + c = inLengths[3]; + + return (0); + }; +}; + +template +static void mean_meansquare_host(const Tensor& in, + Tensor& mean_ref, + Tensor& meansquare_ref, + size_t n, + size_t h, + size_t w, + size_t c) + +{ + auto thread_reduce_func = [&](auto iN) { + AccDataType mean = type_convert(0.0f); + AccDataType meansquare = type_convert(0.0f); + + // compute mean, meanquare, variance, invVariance + for(std::size_t iH = 0; iH < h; iH++) + { + for(std::size_t iW = 0; iW < w; iW++) + { + for(std::size_t iC = 0; iC < c; iC++) + { + AccDataType curr_value = type_convert(in(iN, iH, iW, iC)); + + mean += curr_value; + meansquare += curr_value * curr_value; + }; + } + }; + + mean = mean / (h * w * c); + meansquare = meansquare / (h * w * c); + + mean_ref(iN) = mean; + meansquare_ref(iN) = meansquare; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = (n + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; it++) + { + std::size_t iN_begin = it * work_per_thread; + std::size_t iN_end = std::min(static_cast((it + 1) * work_per_thread), n); + + auto f = [=] { + for(std::size_t iN = iN_begin; iN < iN_end; iN++) + { + thread_reduce_func(iN); + } + }; + + threads[it] = joinable_thread(f); + } +}; + +int main(int argc, char* argv[]) +{ + // layer-norm calculates mean and meansquare by reducing [N, H, W, C] to [N] + const std::vector reduceDims{1, 2, 3}; + const std::vector invariantDims{0}; + + SimpleAppArgs args; + + if(argc > 1) + { + if(args.processArgs(argc, argv) < 0) + return (-1); + }; + + Tensor in(args.inLengths); + + std::vector outLengths{args.n}; + + Tensor mean_ref(outLengths); + Tensor mean(outLengths); + Tensor meansquare_ref(outLengths); + Tensor meansquare(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = mean.mDesc.GetStrides(); + + size_t invariant_total_length = args.n; + size_t reduce_total_length = args.h * args.w * args.c; + + const float alpha = 1.0f; + const float beta = 0.0f; + + std::size_t num_thread = 1; + + if(args.do_verification) + { + switch(args.init_method) + { + case 0: break; + case 1: in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); break; + case 2: in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); break; + default: in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); + DeviceMem mean_dev(sizeof(OutDataType) * mean.mDesc.GetElementSpace()); + DeviceMem meansquare_dev(sizeof(OutDataType) * meansquare.mDesc.GetElementSpace()); + + in_dev.ToDevice(in.mData.data()); + + if(args.do_verification) + { + mean_meansquare_host( + in, mean_ref, meansquare_ref, args.n, args.h, args.w, args.c); + }; + + std::vector i_inLengths; + std::vector i_inStrides; + std::vector i_outLengths; + std::vector i_outStrides; + + i_inLengths.assign(args.inLengths.begin(), args.inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + i_outLengths.assign(outLengths.begin(), outLengths.end()); + i_outStrides.assign(outStrides.begin(), outStrides.end()); + + auto reduce_1 = DeviceReduceInstance_Mean{}; + auto reduce_2 = DeviceReduceInstance_Meansquare{}; + + auto argument_ptr_1 = reduce_1.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + mean_dev.GetDeviceBuffer(), + nullptr, + InElementwiseOperation_Mean{}, + AccElementwiseOperation_Mean{static_cast(reduce_total_length)}); + + auto argument_ptr_2 = reduce_2.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + meansquare_dev.GetDeviceBuffer(), + nullptr, + InElementwiseOperation_Meansquare{}, + AccElementwiseOperation_Meansquare{static_cast(reduce_total_length)}); + + if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()) || + !reduce_2.IsSupportedArgument(argument_ptr_2.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + return (-1); + }; + + std::string reduce_name_1 = reduce_1.GetTypeString(); + std::string reduce_name_2 = reduce_2.GetTypeString(); + + auto invoker_ptr_1 = reduce_1.MakeInvokerPointer(); + auto invoker_ptr_2 = reduce_2.MakeInvokerPointer(); + + float avg_time = 0.0f; + + avg_time += invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, args.time_kernel}); + avg_time += invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, args.time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + + invariant_total_length * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name_1 + << " + " << reduce_name_2 << std::endl; + + bool pass = true; + + if(args.do_verification) + { + mean_dev.FromDevice(mean.mData.data()); + meansquare_dev.FromDevice(meansquare.mData.data()); + pass = pass && ck::utils::check_err(mean.mData, mean_ref.mData); + pass = pass && ck::utils::check_err(meansquare.mData, meansquare_ref.mData); + }; + + return (pass ? 0 : 1); +} diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index e595ca23333..8786dedc000 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -54,3 +54,4 @@ add_subdirectory(16_gemm_reduce) add_subdirectory(18_batched_gemm_reduce) add_subdirectory(19_binary_elementwise) add_subdirectory(20_convnd_bwd_weight_xdl) +add_subdirectory(21_lnorm) diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 5e81c6a469b..0ad78423fe5 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -325,7 +325,7 @@ struct DynamicBuffer { if(is_valid_element) { - atomic_add(c_style_pointer_cast(&p_data_[i]), x); + atomic_add(c_style_pointer_cast(&p_data_[i]), x); } } } diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp index e7a8db8c011..84320f7851e 100644 --- a/include/ck/utility/reduction_operator.hpp +++ b/include/ck/utility/reduction_operator.hpp @@ -46,6 +46,8 @@ namespace reduce { // operator can use the InMemoryDataOperation to finalize, or else it return false 3) operator() -- // the first argument of the operator must be both an input & output, and the corresponding variable // usually stores +// 3) operator() -- the first argument of the operator must be both an input & output, and the +// corresponding variable usually stores // the accumulated result of many operator() calls; the second argument is only an // input. For indexable binary // operator, the second version of operator() has third argument (which is an