From b3049bbcad5529f48827c389564bc4acdb2a48f0 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 6 Apr 2023 07:22:58 -0400 Subject: [PATCH 01/10] Rename to proper naming --- example/42_groupnorm/CMakeLists.txt | 2 +- ...roupnorm_sigmoid_fp16.cpp => groupnorm_sigmoid_mul_fp16.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename example/42_groupnorm/{groupnorm_sigmoid_fp16.cpp => groupnorm_sigmoid_mul_fp16.cpp} (100%) diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt index c3b7b825920..12b05674716 100644 --- a/example/42_groupnorm/CMakeLists.txt +++ b/example/42_groupnorm/CMakeLists.txt @@ -1 +1 @@ -add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp) +add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp similarity index 100% rename from example/42_groupnorm/groupnorm_sigmoid_fp16.cpp rename to example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp From 784bb283231f510a9d8f2fdf35ee703cd423d395 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 6 Apr 2023 07:48:36 -0400 Subject: [PATCH 02/10] Add example of groupnorm + swish --- example/42_groupnorm/CMakeLists.txt | 1 + example/42_groupnorm/groupnorm_swish_fp16.cpp | 158 ++++++++++++++++++ .../element/unary_element_wise_operation.hpp | 19 ++- 3 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 example/42_groupnorm/groupnorm_swish_fp16.cpp diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt index 12b05674716..a9990c5d890 100644 --- a/example/42_groupnorm/CMakeLists.txt +++ b/example/42_groupnorm/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) +add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp) diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp new file mode 100644 index 00000000000..f743d332d0c --- /dev/null +++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using YElementOp = ck::tensor_operation::element_wise::Swish; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +int main(int argc, char* argv[]) +{ + ck::index_t N = 2; + ck::index_t H = 32; + ck::index_t W = 32; + ck::index_t G = 32; + ck::index_t C = 30; + + if(argc == 1) + { + // use default case + } + else if(argc == 6) + { + N = std::stoi(argv[1]); + H = std::stoi(argv[2]); + W = std::stoi(argv[3]); + G = std::stoi(argv[4]); + C = std::stoi(argv[5]); + } + else + { + std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; + + return 1; + } + + Tensor x({N, H, W, G, C}); + Tensor y({N, H, W, G, C}); + Tensor gamma({G, C}); + Tensor beta({G, C}); + + ck::utils::FillUniformDistribution{0.f, 1.f}(x); + ck::utils::FillUniformDistribution{0.f, 1.f}(gamma); + ck::utils::FillUniformDistribution{0.f, 1.f}(beta); + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + const auto y_element_op = YElementOp{}; + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {N, H, W, G, C}, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + {0, 0, 0, C, 1}, + {0, 0, 0, C, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1, 2, 4}, // reduction dimension: [H, W, C] + 1e-6, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + nullptr, + nullptr, + y_element_op); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); + + std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + + sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + + sizeof(BetaDataType) * G * C; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, " + << device_instance.GetTypeString() << std::endl; + + bool pass = true; + { + Tensor host_y({N, H, W, G, C}); + using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); + } + + return (pass ? 0 : 1); +} diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index f1f3042ad1b..3a82d3aa7a4 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -316,8 +316,6 @@ struct Sigmoid y = 1 / (ck::type_convert(1) + exp(-x)); }; - - int32_t divider_ = 1; }; struct TanH @@ -333,6 +331,23 @@ struct TanH }; }; +struct Swish +{ + Swish(float beta = 1.0f) : beta_(beta) {} + + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = x / (ck::type_convert(1) + exp(-beta_ * x)); + }; + + float beta_ = 1.0f; +}; + } // namespace element_wise } // namespace tensor_operation } // namespace ck From d651dc85410c504d668adc7d3a7dc78252ac96f7 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 6 Apr 2023 13:18:41 -0400 Subject: [PATCH 03/10] Extract duplicate code in example --- example/42_groupnorm/common.hpp | 23 ++++ .../groupnorm_sigmoid_mul_fp16.cpp | 124 +----------------- example/42_groupnorm/groupnorm_swish_fp16.cpp | 124 +----------------- .../42_groupnorm/run_groupnorm_example.inc | 109 +++++++++++++++ 4 files changed, 138 insertions(+), 242 deletions(-) create mode 100644 example/42_groupnorm/common.hpp create mode 100644 example/42_groupnorm/run_groupnorm_example.inc diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp new file mode 100644 index 00000000000..e159abf3e94 --- /dev/null +++ b/example/42_groupnorm/common.hpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp index 35c7c054e05..b07a26c4c93 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp +++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp @@ -1,24 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" - -#include "ck/library/utility/fill.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" +#include "common.hpp" constexpr int Rank = 5; constexpr int NumReduceDim = 3; @@ -68,107 +51,6 @@ using DeviceInstance = 2, // BetaScalarPerVector 2>; // OutScalarPerVector -int main(int argc, char* argv[]) -{ - ck::index_t N = 2; - ck::index_t H = 32; - ck::index_t W = 32; - ck::index_t G = 32; - ck::index_t C = 30; - - if(argc == 1) - { - // use default case - } - else if(argc == 6) - { - N = std::stoi(argv[1]); - H = std::stoi(argv[2]); - W = std::stoi(argv[3]); - G = std::stoi(argv[4]); - C = std::stoi(argv[5]); - } - else - { - std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; - - return 1; - } - - Tensor x({N, H, W, G, C}); - Tensor y({N, H, W, G, C}); - Tensor gamma({G, C}); - Tensor beta({G, C}); - - ck::utils::FillUniformDistribution{0.f, 1.f}(x); - ck::utils::FillUniformDistribution{0.f, 1.f}(gamma); - ck::utils::FillUniformDistribution{0.f, 1.f}(beta); - - DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); - DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); - DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); - DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); - - x_dev.ToDevice(x.mData.data()); - gamma_dev.ToDevice(gamma.mData.data()); - beta_dev.ToDevice(beta.mData.data()); - - const auto y_element_op = YElementOp{}; - - auto device_instance = DeviceInstance{}; - auto argument_ptr = device_instance.MakeArgumentPointer( - {N, H, W, G, C}, - std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, - {0, 0, 0, C, 1}, - {0, 0, 0, C, 1}, - std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, - {1, 2, 4}, // reduction dimension: [H, W, C] - 1e-6, - x_dev.GetDeviceBuffer(), - gamma_dev.GetDeviceBuffer(), - beta_dev.GetDeviceBuffer(), - y_dev.GetDeviceBuffer(), - nullptr, - nullptr, - y_element_op); - - if(!device_instance.IsSupportedArgument(argument_ptr.get())) - { - std::cout << "The runtime parameters are not supported" << std::endl; - return 1; - }; - - auto invoker_ptr = device_instance.MakeInvokerPointer(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); - - std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + - sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + - sizeof(BetaDataType) * G * C; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, " - << device_instance.GetTypeString() << std::endl; - - bool pass = true; - { - Tensor host_y({N, H, W, G, C}); - using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; - - ReferenceInstance ref; - auto ref_argument = - ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6); - auto ref_invoker = ref.MakeInvoker(); - ref_invoker.Run(ref_argument); - - y_dev.FromDevice(y.mData.data()); - pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); - } +#include "run_groupnorm_example.inc" - return (pass ? 0 : 1); -} +int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp index f743d332d0c..c52243bfb0c 100644 --- a/example/42_groupnorm/groupnorm_swish_fp16.cpp +++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp @@ -1,24 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" - -#include "ck/library/utility/fill.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" +#include "common.hpp" constexpr int Rank = 5; constexpr int NumReduceDim = 3; @@ -52,107 +35,6 @@ using DeviceInstance = 2, // BetaScalarPerVector 2>; // OutScalarPerVector -int main(int argc, char* argv[]) -{ - ck::index_t N = 2; - ck::index_t H = 32; - ck::index_t W = 32; - ck::index_t G = 32; - ck::index_t C = 30; - - if(argc == 1) - { - // use default case - } - else if(argc == 6) - { - N = std::stoi(argv[1]); - H = std::stoi(argv[2]); - W = std::stoi(argv[3]); - G = std::stoi(argv[4]); - C = std::stoi(argv[5]); - } - else - { - std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; - - return 1; - } - - Tensor x({N, H, W, G, C}); - Tensor y({N, H, W, G, C}); - Tensor gamma({G, C}); - Tensor beta({G, C}); - - ck::utils::FillUniformDistribution{0.f, 1.f}(x); - ck::utils::FillUniformDistribution{0.f, 1.f}(gamma); - ck::utils::FillUniformDistribution{0.f, 1.f}(beta); - - DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); - DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); - DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); - DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); - - x_dev.ToDevice(x.mData.data()); - gamma_dev.ToDevice(gamma.mData.data()); - beta_dev.ToDevice(beta.mData.data()); - - const auto y_element_op = YElementOp{}; - - auto device_instance = DeviceInstance{}; - auto argument_ptr = device_instance.MakeArgumentPointer( - {N, H, W, G, C}, - std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, - {0, 0, 0, C, 1}, - {0, 0, 0, C, 1}, - std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, - {1, 2, 4}, // reduction dimension: [H, W, C] - 1e-6, - x_dev.GetDeviceBuffer(), - gamma_dev.GetDeviceBuffer(), - beta_dev.GetDeviceBuffer(), - y_dev.GetDeviceBuffer(), - nullptr, - nullptr, - y_element_op); - - if(!device_instance.IsSupportedArgument(argument_ptr.get())) - { - std::cout << "The runtime parameters are not supported" << std::endl; - return 1; - }; - - auto invoker_ptr = device_instance.MakeInvokerPointer(); - float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); - - std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + - sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + - sizeof(BetaDataType) * G * C; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, " - << device_instance.GetTypeString() << std::endl; - - bool pass = true; - { - Tensor host_y({N, H, W, G, C}); - using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; - - ReferenceInstance ref; - auto ref_argument = - ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6); - auto ref_invoker = ref.MakeInvoker(); - ref_invoker.Run(ref_argument); - - y_dev.FromDevice(y.mData.data()); - pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); - } +#include "run_groupnorm_example.inc" - return (pass ? 0 : 1); -} +int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/example/42_groupnorm/run_groupnorm_example.inc new file mode 100644 index 00000000000..03da945830d --- /dev/null +++ b/example/42_groupnorm/run_groupnorm_example.inc @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +int run_groupnorm_example(int argc, char* argv[]) +{ + ck::index_t N = 2; + ck::index_t H = 32; + ck::index_t W = 32; + ck::index_t G = 32; + ck::index_t C = 30; + + if(argc == 1) + { + // use default case + } + else if(argc == 6) + { + N = std::stoi(argv[1]); + H = std::stoi(argv[2]); + W = std::stoi(argv[3]); + G = std::stoi(argv[4]); + C = std::stoi(argv[5]); + } + else + { + std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; + + return 1; + } + + Tensor x({N, H, W, G, C}); + Tensor y({N, H, W, G, C}); + Tensor gamma({G, C}); + Tensor beta({G, C}); + + ck::utils::FillUniformDistribution{0.f, 1.f}(x); + ck::utils::FillUniformDistribution{0.f, 1.f}(gamma); + ck::utils::FillUniformDistribution{0.f, 1.f}(beta); + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + const auto y_element_op = YElementOp{}; + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {N, H, W, G, C}, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + {0, 0, 0, C, 1}, + {0, 0, 0, C, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1, 2, 4}, // reduction dimension: [H, W, C] + 1e-6, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + nullptr, + nullptr, + y_element_op); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); + + std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + + sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + + sizeof(BetaDataType) * G * C; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, " + << device_instance.GetTypeString() << std::endl; + + bool pass = true; + { + Tensor host_y({N, H, W, G, C}); + using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); + } + + return (pass ? 0 : 1); +} From e6e8edefe1aae85578f00a5708f31db2bb413136 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 6 Apr 2023 13:34:27 -0400 Subject: [PATCH 04/10] Add groupnorm + swish instances --- .../device_normalization_f16_instance.cpp | 10 +++++++++- .../device_normalization_f32_instance.cpp | 9 ++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp index beeaa3aa22d..a0fcf23f4a3 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -15,7 +15,8 @@ namespace instance { using F16 = ck::half_t; using F32 = float; -using Pass = ck::tensor_operation::element_wise::PassThrough; +using Pass = ck::tensor_operation::element_wise::PassThrough; +using Swish = ck::tensor_operation::element_wise::Swish; template // clang-format off @@ -64,6 +65,13 @@ void add_device_normalization_rank_5_3_f16_instances( add_device_operation_instances(instances, device_normalization_f16_instances{}); } +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp index 4d236fb6332..2b473922a5e 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -15,7 +15,7 @@ namespace instance { using F32 = float; using Pass = ck::tensor_operation::element_wise::PassThrough; - +using Swish = ck::tensor_operation::element_wise::Swish; template using device_layernorm_f32_instances = std::tuple< // clang-format off @@ -63,6 +63,13 @@ void add_device_normalization_rank_5_3_f32_instances( add_device_operation_instances(instances, device_layernorm_f32_instances{}); } +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation From 30cd8809c3561aa4db0bce564ee6f6a1a991aee7 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 7 Apr 2023 01:33:06 -0400 Subject: [PATCH 05/10] Ractor instance generation, split into multiple cpp file --- .../gpu/normalization/CMakeLists.txt | 10 ++- .../device_groupnorm_f16_instance.cpp | 23 ++++++ .../device_groupnorm_f32_instance.cpp | 23 ++++++ .../device_groupnorm_swish_f16_instance.cpp | 23 ++++++ .../device_groupnorm_swish_f32_instance.cpp | 23 ++++++ .../device_layernorm2d_f16_instance.cpp | 23 ++++++ .../device_layernorm2d_f32_instance.cpp | 23 ++++++ .../device_layernorm4d_f16_instance.cpp | 23 ++++++ .../device_layernorm4d_f32_instance.cpp | 23 ++++++ .../device_normalization_f16_instance.cpp | 78 ------------------- ....cpp => normalization_instance_common.hpp} | 61 +++++++-------- 11 files changed, 222 insertions(+), 111 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp rename library/src/tensor_operation_instance/gpu/normalization/{device_normalization_f32_instance.cpp => normalization_instance_common.hpp} (53%) diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt index aa0cc114805..6bed36e350f 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -1,4 +1,10 @@ add_instance_library(device_normalization_instance - device_normalization_f16_instance.cpp - device_normalization_f32_instance.cpp + device_layernorm2d_f16_instance.cpp + device_layernorm2d_f32_instance.cpp + device_layernorm4d_f16_instance.cpp + device_layernorm4d_f32_instance.cpp + device_groupnorm_f16_instance.cpp + device_groupnorm_f32_instance.cpp + device_groupnorm_swish_f16_instance.cpp + device_groupnorm_swish_f32_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp new file mode 100644 index 00000000000..e9c2112e16e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp new file mode 100644 index 00000000000..79dde38fc90 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp new file mode 100644 index 00000000000..6241e033856 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp new file mode 100644 index 00000000000..b64328d5d07 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp new file mode 100644 index 00000000000..d6a2f6f2c1c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp new file mode 100644 index 00000000000..73097828e3b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp new file mode 100644 index 00000000000..507a683ee7a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp new file mode 100644 index 00000000000..ca1aa0c25ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp deleted file mode 100644 index a0fcf23f4a3..00000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Pass = ck::tensor_operation::element_wise::PassThrough; -using Swish = ck::tensor_operation::element_wise::Swish; - -template -// clang-format off -using device_normalization_f16_instances = - std::tuple < - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - >; -// clang-format on - -void add_device_normalization_rank_2_1_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_4_3_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_5_3_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_5_3_swish_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp similarity index 53% rename from library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp index 2b473922a5e..a58fb6ca35a 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp @@ -1,6 +1,8 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" #include "ck/utility/data_type.hpp" @@ -12,12 +14,37 @@ namespace tensor_operation { namespace device { namespace instance { +using F16 = ck::half_t; using F32 = float; -using Pass = ck::tensor_operation::element_wise::PassThrough; -using Swish = ck::tensor_operation::element_wise::Swish; template -using device_layernorm_f32_instances = std::tuple< +using device_normalization_f16_instances = + // clang-format off + std::tuple < + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; + +template +using device_normalization_f32_instances = std::tuple< // clang-format off // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> DeviceNormalizationImpl, // irregular size @@ -42,34 +69,6 @@ using device_layernorm_f32_instances = std::tuple< // clang-format on >; -void add_device_normalization_rank_2_1_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_4_3_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_5_3_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_5_3_swish_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation From 7221293cf023102417fa1eaff22144d662d4e741 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 7 Apr 2023 04:50:40 -0400 Subject: [PATCH 06/10] Add external api and client example --- client_example/18_groupnorm/CMakeLists.txt | 2 + .../18_groupnorm/groupnorm_swish.cpp | 169 ++++++++++++++++++ .../device_operation_instance_factory.hpp | 1 + .../gpu/normalization_swish.hpp | 81 +++++++++ 4 files changed, 253 insertions(+) create mode 100644 client_example/18_groupnorm/CMakeLists.txt create mode 100644 client_example/18_groupnorm/groupnorm_swish.cpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp diff --git a/client_example/18_groupnorm/CMakeLists.txt b/client_example/18_groupnorm/CMakeLists.txt new file mode 100644 index 00000000000..17c88cb61bc --- /dev/null +++ b/client_example/18_groupnorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_groupnorm_swish groupnorm_swish.cpp) +target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations) diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp new file mode 100644 index 00000000000..f47349e4cca --- /dev/null +++ b/client_example/18_groupnorm/groupnorm_swish.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using Swish = ck::tensor_operation::element_wise::Swish; + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t N = 2; + ck::index_t H = 32; + ck::index_t W = 32; + ck::index_t G = 32; + ck::index_t C = 30; + + std::size_t xy_size = N * H * W * G * C; + std::size_t gamma_beta_size = G * C; + + std::vector xy_strides = {H * W * G * C, W * G * C, G * C, C, 1}; + std::vector gamma_beta_strides = {0, 0, 0, C, 1}; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); + + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = + sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size + + sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index f176cb91e0d..18864395280 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -96,6 +96,7 @@ using FastGelu = ck::tensor_operation::element_wise::FastGelu; using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; using Gelu = ck::tensor_operation::element_wise::Gelu; +using Swish = ck::tensor_operation::element_wise::Swish; template using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp new file mode 100644 index 00000000000..c04a54455d1 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>&); + +// FP32 +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceNormalization> +{ + using DeviceOp = DeviceNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From 52625779516ee43db09d6c38a0e9019935ffe4dc Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 7 Apr 2023 05:07:34 -0400 Subject: [PATCH 07/10] Refine profiler message --- profiler/include/profiler/profile_groupnorm_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp index 81fec5590a8..73343f6bec2 100644 --- a/profiler/include/profiler/profile_groupnorm_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_impl.hpp @@ -190,9 +190,9 @@ bool profile_groupnorm_impl(int do_verification, if(time_kernel) { - LogRange(std::cout << "length = ", length, ",") << ", "; - std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " - << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + LogRange(std::cout << "length = ", length, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; } if(num_kernel == 0) From af2004b5fea43604b053493c1f11cb74aa30c660 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 7 Apr 2023 05:56:05 -0400 Subject: [PATCH 08/10] Use ck math version of exp --- .../gpu/element/unary_element_wise_operation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 3a82d3aa7a4..2987def02a6 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -342,7 +342,7 @@ struct Swish is_same::value, "Data type is not supported by this operation!"); - y = x / (ck::type_convert(1) + exp(-beta_ * x)); + y = x / (ck::type_convert(1) + ck::math::exp(-beta_ * x)); }; float beta_ = 1.0f; From a22296fd52f966325eabf6e2057036940a30c4cb Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 7 Apr 2023 05:57:03 -0400 Subject: [PATCH 09/10] Refine problem size in example --- client_example/18_groupnorm/groupnorm_swish.cpp | 10 +++++----- example/42_groupnorm/run_groupnorm_example.inc | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp index f47349e4cca..8a873e6acd3 100644 --- a/client_example/18_groupnorm/groupnorm_swish.cpp +++ b/client_example/18_groupnorm/groupnorm_swish.cpp @@ -40,11 +40,11 @@ struct SimpleDeviceMem int main(int argc, char* argv[]) { - ck::index_t N = 2; - ck::index_t H = 32; - ck::index_t W = 32; - ck::index_t G = 32; - ck::index_t C = 30; + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; std::size_t xy_size = N * H * W * G * C; std::size_t gamma_beta_size = G * C; diff --git a/example/42_groupnorm/run_groupnorm_example.inc b/example/42_groupnorm/run_groupnorm_example.inc index 03da945830d..bd7eb98ca0f 100644 --- a/example/42_groupnorm/run_groupnorm_example.inc +++ b/example/42_groupnorm/run_groupnorm_example.inc @@ -5,11 +5,11 @@ int run_groupnorm_example(int argc, char* argv[]) { - ck::index_t N = 2; - ck::index_t H = 32; - ck::index_t W = 32; - ck::index_t G = 32; - ck::index_t C = 30; + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; if(argc == 1) { From 876a372d3d8363d50d4e80453f68d57f0408d9d2 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 7 Apr 2023 13:22:46 -0400 Subject: [PATCH 10/10] Add host version of exp --- include/ck/utility/math.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp index 12203bd7f31..72071992f65 100644 --- a/include/ck/utility/math.hpp +++ b/include/ck/utility/math.hpp @@ -168,6 +168,10 @@ __device__ double exp(double x) return exp(x); } +static inline __host__ float exp(float x) { return std::expf(x); } + +static inline __host__ double exp(double x) { return std::exp(x); } + // greatest common divisor, aka highest common factor __host__ __device__ constexpr index_t gcd(index_t x, index_t y) {