ROCm · zjing14 · Apr 10, 2023 · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023
@@ -0,0 +1,2 @@
+add_executable(client_groupnorm_swish groupnorm_swish.cpp)
+target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using Swish           = ck::tensor_operation::element_wise::Swish;
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
+
+    std::size_t xy_size         = N * H * W * G * C;
+    std::size_t gamma_beta_size = G * C;
+
+    std::vector<ck::index_t> xy_strides         = {H * W * G * C, W * G * C, G * C, C, 1};
+    std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
+
+    SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
+    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       ComputeDataType,
+                                                                       YDataType,
+                                                                       Swish,
+                                                                       Rank,
+                                                                       NumReduceDim>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                                        xy_strides,         // xStrides
+                                                        gamma_beta_strides, // gammaStrides
+                                                        gamma_beta_strides, // betaStrides
+                                                        xy_strides,         // yStrides
+                                                        {1, 2, 4},          // reduceDims
+                                                        1e-6,
+                                                        x_device_buf.GetDeviceBuffer(),
+                                                        gamma_device_buf.GetDeviceBuffer(),
+                                                        beta_device_buf.GetDeviceBuffer(),
+                                                        y_device_buf.GetDeviceBuffer(),
+                                                        nullptr,
+                                                        nullptr,
+                                                        Swish{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_byte =
+                sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
+                sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
+
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},    // lengths
+                                                        xy_strides,         // xStrides
+                                                        gamma_beta_strides, // gammaStrides
+                                                        gamma_beta_strides, // betaStrides
+                                                        xy_strides,         // yStrides
+                                                        {1, 2, 4},          // reduceDims
+                                                        1e-6,
+                                                        x_device_buf.GetDeviceBuffer(),
+                                                        gamma_device_buf.GetDeviceBuffer(),
+                                                        beta_device_buf.GetDeviceBuffer(),
+                                                        y_device_buf.GetDeviceBuffer(),
+                                                        nullptr,
+                                                        nullptr,
+                                                        Swish{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
@@ -1 +1,2 @@
-add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp)
+add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+
+struct YElementOp
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
+                          ck::is_same<T, ck::half_t>::value,
+                      "Data type is not supported by this operation!");
+
+        T a;
+
+        ck::tensor_operation::element_wise::Sigmoid{}(a, x);
+
+        y = x * a;
+    };
+};
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          ComputeDataType,
+                                                          YDataType,
+                                                          YElementOp,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          1024, // BlockSize
+                                                          1,    // ClusterM
+                                                          1024, // ClusterK
+                                                          1,    // SliceM
+                                                          32,   // SliceK
+                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                          2,    // SrcScalarPerVector
+                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                          2,    // GammaScalarPerVector
+                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                          2,    // BetaScalarPerVector
+                                                          2>;   // OutScalarPerVector
+
+#include "run_groupnorm_example.inc"
+
+int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); }
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5;
+constexpr int NumReduceDim = 3;
+
+using XDataType       = ck::half_t;
+using GammaDataType   = ck::half_t;
+using BetaDataType    = ck::half_t;
+using YDataType       = ck::half_t;
+using ComputeDataType = float;
+using YElementOp      = ck::tensor_operation::element_wise::Swish;
+
+using DeviceInstance =
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          ComputeDataType,
+                                                          YDataType,
+                                                          YElementOp,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          1024, // BlockSize
+                                                          1,    // ClusterM
+                                                          1024, // ClusterK
+                                                          1,    // SliceM
+                                                          32,   // SliceK
+                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                          2,    // SrcScalarPerVector
+                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                          2,    // GammaScalarPerVector
+                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                          2,    // BetaScalarPerVector
+                                                          2>;   // OutScalarPerVector
+
+#include "run_groupnorm_example.inc"
+
+int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); }
@@ -1,80 +1,15 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <getopt.h>
-
-#include "ck/ck.hpp"
-#include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-
-#include "ck/library/utility/fill.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
-
-constexpr int Rank         = 5;
-constexpr int NumReduceDim = 3;
-
-using XDataType       = ck::half_t;
-using GammaDataType   = ck::half_t;
-using BetaDataType    = ck::half_t;
-using YDataType       = ck::half_t;
-using ComputeDataType = float;
-
-struct YElementOp
-{
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
-    {
-        static_assert(ck::is_same<T, float>::value || ck::is_same<T, double>::value ||
-                          ck::is_same<T, ck::half_t>::value,
-                      "Data type is not supported by this operation!");
-
-        T a;
+#pragma once
 
-        ck::tensor_operation::element_wise::Sigmoid{}(a, x);
-
-        y = x * a;
-    };
-};
-
-using DeviceInstance =
-    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
-                                                          GammaDataType,
-                                                          BetaDataType,
-                                                          ComputeDataType,
-                                                          YDataType,
-                                                          YElementOp,
-                                                          Rank,
-                                                          NumReduceDim,
-                                                          1024, // BlockSize
-                                                          1,    // ClusterM
-                                                          1024, // ClusterK
-                                                          1,    // SliceM
-                                                          32,   // SliceK
-                                                          1,    // SrcVecDim (0=M, 1=K)
-                                                          2,    // SrcScalarPerVector
-                                                          1,    // GammaVecDim (0=M, 1=K)
-                                                          2,    // GammaScalarPerVector
-                                                          1,    // BetaVecDim (0=M, 1=K)
-                                                          2,    // BetaScalarPerVector
-                                                          2>;   // OutScalarPerVector
-
-int main(int argc, char* argv[])
+int run_groupnorm_example(int argc, char* argv[])
 {
-    ck::index_t N = 2;
-    ck::index_t H = 32;
-    ck::index_t W = 32;
-    ck::index_t G = 32;
-    ck::index_t C = 30;
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
 
     if(argc == 1)
     {
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		add_executable(client_groupnorm_swish groupnorm_swish.cpp)
		target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)