From d6e053a3ce8fb8212ec374a519aaec6bd7d43bce Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Mon, 4 Apr 2022 04:16:01 +0800
Subject: [PATCH 01/28] Part of gemm + softmax, Add gemm + reduceMax

---
 example/19_gemm_softmax/CMakeLists.txt        |   1 +
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 281 ++++++++++++++++++
 example/CMakeLists.txt                        |   1 +
 3 files changed, 283 insertions(+)
 create mode 100644 example/19_gemm_softmax/CMakeLists.txt
 create mode 100644 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
diff --git a/example/19_gemm_softmax/CMakeLists.txt b/example/19_gemm_softmax/CMakeLists.txt
new file mode 100644
index 00000000000..740f931e5ab
--- /dev/null
+++ b/example/19_gemm_softmax/CMakeLists.txt
@@ -0,0 +1 @@
+add_example_executable(example_gemm_softmax_xdl_fp16 gemm_softmax_xdl_fp16.cpp)
diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
new file mode 100644
index 00000000000..cb60ae42372
--- /dev/null
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -0,0 +1,281 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_reduce_util.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_xdl.hpp"
+#include "device_gemm_xdl_c_shuffle.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+
+#include "device_reduce_blockwise.hpp"
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType   = F16;
+using BDataType   = F16;
+using CDataType   = F16;
+using AccDataType = F32;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle<
+    ADataType,              // ADataType
+    BDataType,              // BDataType
+    CDataType,              // CDataType
+    AccDataType,            // AccDataType
+    CDataType,              // CShuffleDataType
+    ALayout,                // ALayout
+    BLayout,                // BLayout
+    CLayout,                // CLayout
+    PassThrough,            // AElementwiseOperation
+    PassThrough,            // BElementwiseOperation
+    PassThrough,            // CElementwiseOperation
+    256,                    // BlockSize
+    256,                    // MPerBlock
+    128,                    // NPerBlock
+    32,                     // KPerBlock
+    8,                      // AK1
+    8,                      // BK1
+    32,                     // MPerXDL
+    32,                     // NPerXDL
+    4,                      // MXdlPerWave
+    2,                      // NXdlPerWave
+    S<4, 64, 1>,            // ABlockTransferThreadClusterLengths_K0_M_K1
+    S<1, 0, 2>,             // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,             // ABlockTransferSrcAccessOrder
+    2,                      // ABlockTransferSrcVectorDim
+    8,                      // ABlockTransferSrcScalarPerVector
+    8,                      // ABlockTransferDstScalarPerVector_K1
+    true,                   // ABlockLdsAddExtraM
+    S<4, 64, 1>,            // BBlockTransferThreadClusterLengths_K0_N_K1
+    S<1, 0, 2>,             // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,             // BBlockTransferSrcAccessOrder
+    2,                      // BBlockTransferSrcVectorDim
+    8,                      // BBlockTransferSrcScalarPerVector
+    8,                      // BBlockTransferDstScalarPerVector_K1
+    true,                   // BBlockLdsAddExtraN
+    1,                      // CShuffleMXdlPerWavePerShuffle
+    1,                      // CShuffleNXdlPerWavePerShuffle
+    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+    8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
+// clang-format on
+
+constexpr int Rank                        = 2;
+constexpr int NumReduceDim                = 1;
+constexpr ck::ReduceTensorOp_t ReduceOpId = ck::ReduceTensorOp_t::MAX;
+constexpr ck::NanPropagation_t NanOpt     = ck::NanPropagation_t::PROPAGATE_NAN;
+constexpr bool PropagateNan = (NanOpt == ck::NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+// constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES;
+using ReduceOperation = typename ck::reduce_binary_operator<CDataType, ReduceOpId>::opType;
+using InElementwiseOperation =
+    typename ck::reduce_unary_operator<CDataType, ReduceOpId, true, true>::InElementwiseOperation;
+using AccElementwiseOperation =
+    typename ck::reduce_unary_operator<CDataType, ReduceOpId, true, true>::AccElementwiseOperation;
+
+using DeviceReduceInstance =
+    ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
+                                                        CDataType,
+                                                        CDataType,
+                                                        Rank,
+                                                        NumReduceDim,
+                                                        ReduceOperation,
+                                                        InElementwiseOperation,
+                                                        AccElementwiseOperation,
+                                                        PropagateNan,
+                                                        false,
+                                                        256,
+                                                        4,
+                                                        64,
+                                                        1,
+                                                        1,
+                                                        0,
+                                                        1,
+                                                        1>;
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = 0;
+    int init_method      = 0;
+    int nrepeat          = 5;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    const std::vector<int> reduceDims{0};
+    const std::vector<int> reduceInvariantDims{1};
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1, stride}));
+            }
+        };
+
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<CDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<int> c_m_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
+                          std::vector<std::size_t>({1}));
+
+    const auto i_inLengths  = ck::to_int_vector(c_m_n.mDesc.GetLengths());
+    const auto i_inStrides  = ck::to_int_vector(c_m_n.mDesc.GetStrides());
+    const auto i_outLengths = ck::to_int_vector(c_m_n_max.mDesc.GetLengths());
+    const auto i_outStrides = ck::to_int_vector(c_m_n_max.mDesc.GetStrides());
+
+    size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_m_n_max.mDesc.GetElementSize();
+
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "c_m_n: " << c_m_n.mDesc << std::endl;
+    std::cout << "c_m_n_max: " << c_m_n_max.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace());
+    DeviceMem c_m_n_max_device_buf(sizeof(CDataType) * c_m_n_max.mDesc.GetElementSpace());
+    DeviceMem c_m_n_max_indices_dev(0);
+
+    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+
+    // do GEMM
+    auto gemm         = DeviceGemmInstance{};
+    auto gemm_invoker = gemm.MakeInvoker();
+    auto gemm_argument =
+        gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                          static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                          static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                          M,
+                          N,
+                          K,
+                          StrideA,
+                          StrideB,
+                          StrideC,
+                          PassThrough{},
+                          PassThrough{},
+                          PassThrough{});
+
+    if(!gemm.IsSupportedArgument(gemm_argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    gemm_invoker.Run(gemm_argument, nrepeat);
+
+    // do reduce max
+    auto reduce        = DeviceReduceInstance{};
+    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
+    DeviceMem ws_dev(wsSizeInBytes);
+
+    auto argument_ptr =
+        reduce.MakeArgumentPointer(i_inLengths,
+                                   i_inStrides,
+                                   i_outLengths,
+                                   i_outStrides,
+                                   reduceDims,
+                                   1,
+                                   0,
+                                   c_m_n_device_buf.GetDeviceBuffer(),
+                                   c_m_n_max_device_buf.GetDeviceBuffer(),
+                                   c_m_n_max_indices_dev.GetDeviceBuffer(),
+                                   ws_dev.GetDeviceBuffer(),
+                                   InElementwiseOperation{static_cast<int>(reduce_total_length)},
+                                   AccElementwiseOperation{static_cast<int>(reduce_total_length)});
+
+    if(!reduce.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout
+            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+            << std::endl;
+    };
+
+    auto invoker_ptr = reduce.MakeInvokerPointer();
+    invoker_ptr->Run(argument_ptr.get(), nrepeat);
+
+    // TODO = do_verification
+    if(do_verification)
+        ;
+    return 0;
+}
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 830d1189de5..92d6945a466 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -43,3 +43,4 @@ add_subdirectory(17_convnd_bwd_data_xdl)
 add_subdirectory(15_grouped_gemm)
 add_subdirectory(16_gemm_reduce)
 add_subdirectory(18_batched_gemm_reduce)
+add_subdirectory(19_gemm_softmax)

From cbbc7e52393f31d6e823d7f4aa691b8cb70e659b Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 6 Apr 2022 10:35:12 +0000
Subject: [PATCH 02/28] Refine the comment

---
 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index cb60ae42372..d11d14b79d4 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -274,8 +274,8 @@ int main(int argc, char* argv[])
     auto invoker_ptr = reduce.MakeInvokerPointer();
     invoker_ptr->Run(argument_ptr.get(), nrepeat);
 
+    // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv
     // TODO = do_verification
-    if(do_verification)
-        ;
+    (void)do_verification;
     return 0;
 }

From 3e811ccf0479ddbcca9331aac6ace6778e784db6 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Sun, 10 Apr 2022 18:57:17 +0000
Subject: [PATCH 03/28] Add device op for elementwise 2d

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp |  81 ++++++++----
 .../gpu/device/device_elementwise.hpp         |  30 +++++
 .../gpu/device/device_elementwise_2d.hpp      | 120 ++++++++++++++++++
 3 files changed, 208 insertions(+), 23 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index d11d14b79d4..a3124e1742d 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -19,6 +19,7 @@
 #include "device_reduce_blockwise.hpp"
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"
+#include "device_elementwise_2d.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -115,6 +116,17 @@ using DeviceReduceInstance =
                                                         1,
                                                         1>;
 
+struct Sub
+{
+    __host__ __device__ constexpr void operator()(F16& dst, const F16& src1, const F16& src2) const
+    {
+        dst = src1 - src2;
+    }
+};
+
+using DeviceElementwiseInstance =
+    ck::tensor_operation::device::DeviceElementwise_2D<CDataType, CDataType, CDataType, 256, Sub>;
+
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
 
@@ -184,6 +196,7 @@ int main(int argc, char* argv[])
     Tensor<CDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<int> c_m_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
                           std::vector<std::size_t>({1}));
+    Tensor<CDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
     const auto i_inLengths  = ck::to_int_vector(c_m_n.mDesc.GetLengths());
     const auto i_inStrides  = ck::to_int_vector(c_m_n.mDesc.GetStrides());
@@ -196,6 +209,7 @@ int main(int argc, char* argv[])
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "c_m_n: " << c_m_n.mDesc << std::endl;
     std::cout << "c_m_n_max: " << c_m_n_max.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
 
     switch(init_method)
     {
@@ -214,6 +228,7 @@ int main(int argc, char* argv[])
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace());
     DeviceMem c_m_n_max_device_buf(sizeof(CDataType) * c_m_n_max.mDesc.GetElementSpace());
     DeviceMem c_m_n_max_indices_dev(0);
+    DeviceMem d_m_n_device_buf(sizeof(CDataType) * d_m_n.mDesc.GetElementSpace());
 
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -245,34 +260,54 @@ int main(int argc, char* argv[])
     gemm_invoker.Run(gemm_argument, nrepeat);
 
     // do reduce max
-    auto reduce        = DeviceReduceInstance{};
-    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
+    auto reduce_max    = DeviceReduceInstance{};
+    auto wsSizeInBytes = reduce_max.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
     DeviceMem ws_dev(wsSizeInBytes);
 
-    auto argument_ptr =
-        reduce.MakeArgumentPointer(i_inLengths,
-                                   i_inStrides,
-                                   i_outLengths,
-                                   i_outStrides,
-                                   reduceDims,
-                                   1,
-                                   0,
-                                   c_m_n_device_buf.GetDeviceBuffer(),
-                                   c_m_n_max_device_buf.GetDeviceBuffer(),
-                                   c_m_n_max_indices_dev.GetDeviceBuffer(),
-                                   ws_dev.GetDeviceBuffer(),
-                                   InElementwiseOperation{static_cast<int>(reduce_total_length)},
-                                   AccElementwiseOperation{static_cast<int>(reduce_total_length)});
-
-    if(!reduce.IsSupportedArgument(argument_ptr.get()))
+    auto reduce_max_argument_ptr = reduce_max.MakeArgumentPointer(
+        i_inLengths,
+        i_inStrides,
+        i_outLengths,
+        i_outStrides,
+        reduceDims,
+        1,
+        0,
+        c_m_n_device_buf.GetDeviceBuffer(),
+        c_m_n_max_device_buf.GetDeviceBuffer(),
+        c_m_n_max_indices_dev.GetDeviceBuffer(),
+        ws_dev.GetDeviceBuffer(),
+        InElementwiseOperation{static_cast<int>(reduce_total_length)},
+        AccElementwiseOperation{static_cast<int>(reduce_total_length)});
+
+    if(!reduce_max.IsSupportedArgument(reduce_max_argument_ptr.get()))
     {
-        std::cout
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
-            << std::endl;
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the DeviceReduce instance, exiting!");
+    };
+
+    auto reduce_max_invoker_ptr = reduce_max.MakeInvokerPointer();
+    reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat);
+
+    // do broadcast sub
+    auto broadcastSub = DeviceElementwiseInstance{};
+    auto broadcastSub_argument_ptr =
+        broadcastSub.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(),
+                                         c_m_n_max_device_buf.GetDeviceBuffer(),
+                                         d_m_n_device_buf.GetDeviceBuffer(),
+                                         {M, N},
+                                         {StrideC, 1},
+                                         {0, 1},
+                                         {StrideC, 1},
+                                         Sub{});
+
+    if(!broadcastSub.IsSupportedArgument(broadcastSub_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the DeviceElementwise_2D instance, exiting!");
     };
 
-    auto invoker_ptr = reduce.MakeInvokerPointer();
-    invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    auto broadcastSub_invoker_ptr = broadcastSub.MakeInvokerPointer();
+    broadcastSub_invoker_ptr->Run(broadcastSub_argument_ptr.get(), nrepeat);
 
     // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv
     // TODO = do_verification
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
new file mode 100644
index 00000000000..e79e1112256
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
@@ -0,0 +1,30 @@
+#pragma once
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ElementwiseFunctor>
+struct DeviceElementwise : public BaseOperator
+{
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        const std::vector<int>& shape_a,
+                        const std::vector<int>& stride_a,
+                        const std::vector<int>& shape_b,
+                        const std::vector<int>& stride_b,
+                        ElementwiseFunctor functor) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
new file mode 100644
index 00000000000..e060927ab18
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -0,0 +1,120 @@
+#pragma once
+#include <iostream>
+#include <vector>
+
+#include "device.hpp"
+#include "device_elementwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          index_t BlockSize,
+          typename ElementwiseFunctor>
+struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
+{
+    static auto Make2dDescriptor_M_N(const std::vector<int>& shape, const std::vector<int>& stride)
+    {
+        return make_naive_tensor_descriptor(make_tuple(shape[0], shape[1]),
+                                            make_tuple(stride[0], stride[1]));
+    }
+
+    using GridDesc_M_N = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1}));
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const ADataType* p_a,
+                 const BDataType* p_b,
+                 CDataType* p_c,
+                 const std::vector<int>& shape,
+                 const std::vector<int>& stride_a,
+                 const std::vector<int>& stride_b,
+                 const std::vector<int>& stride_c,
+                 ElementwiseFunctor functor)
+            : p_a_(p_a),
+              p_b_(p_b),
+              p_c_(p_c),
+              a_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_a)),
+              b_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_b)),
+              c_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_c)),
+              functor_(functor)
+        {
+        }
+
+        const ADataType* p_a_;
+        const BDataType* p_b_;
+        CDataType* p_c_;
+        GridDesc_M_N a_grid_desc_m_n_;
+        GridDesc_M_N b_grid_desc_m_n_;
+        GridDesc_M_N c_grid_desc_m_n_;
+        ElementwiseFunctor functor_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, int nrepeat = 1)
+        {
+            // TODO
+            (void)arg;
+            (void)nrepeat;
+            return 0;
+        }
+
+        float Run(const BaseArgument* p_arg, int nrepeat = 1) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
+        };
+    };
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        // TODO: properly implement this check
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+        return pArg != nullptr;
+    };
+
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      void* p_c,
+                                                      const std::vector<int>& shape,
+                                                      const std::vector<int>& stride_a,
+                                                      const std::vector<int>& stride_b,
+                                                      const std::vector<int>& stride_c,
+                                                      ElementwiseFunctor functor) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          shape,
+                                          stride_a,
+                                          stride_b,
+                                          stride_c,
+                                          functor);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceElementwise_2D"
+            << "<"
+            << BlockSize
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck

From 6818b58c93b3825b0fef98181ef131827523cd21 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Sun, 10 Apr 2022 19:06:01 +0000
Subject: [PATCH 04/28] Fix compile error

---
 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index a3124e1742d..cfa26184bff 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -86,9 +86,9 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
 
 constexpr int Rank                        = 2;
 constexpr int NumReduceDim                = 1;
-constexpr ck::ReduceTensorOp_t ReduceOpId = ck::ReduceTensorOp_t::MAX;
-constexpr ck::NanPropagation_t NanOpt     = ck::NanPropagation_t::PROPAGATE_NAN;
-constexpr bool PropagateNan = (NanOpt == ck::NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true;
+constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
+constexpr ck::NanPropagation NanOpt     = ck::NanPropagation::PROPAGATE_NAN;
+constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 // constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES;
 using ReduceOperation = typename ck::reduce_binary_operator<CDataType, ReduceOpId>::opType;
 using InElementwiseOperation =

From e3a09b57558a3211ac6dc51ace5ca17696fb1dfe Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Mon, 11 Apr 2022 10:06:05 +0000
Subject: [PATCH 05/28] Add gridwise_elementwise_2d api

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp |  2 +-
 .../gpu/device/device_elementwise_2d.hpp      | 93 +++++++++++++++++--
 .../gpu/grid/gridwise_elementwise_2d.hpp      | 82 ++++++++++++++++
 3 files changed, 169 insertions(+), 8 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index cfa26184bff..af8152e9d9f 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -125,7 +125,7 @@ struct Sub
 };
 
 using DeviceElementwiseInstance =
-    ck::tensor_operation::device::DeviceElementwise_2D<CDataType, CDataType, CDataType, 256, Sub>;
+    ck::tensor_operation::device::DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub, 16, 16, 8, 8>;
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
index e060927ab18..cabdcc6b5e8 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -4,6 +4,7 @@
 
 #include "device.hpp"
 #include "device_elementwise.hpp"
+#include "gridwise_elementwise_2d.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -12,17 +13,36 @@ namespace device {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          index_t BlockSize,
-          typename ElementwiseFunctor>
+          typename ElementwiseFunctor,
+          index_t MThreadPerBlock,
+          index_t NThreadPerBlock,
+          index_t MThreadTileSize,
+          index_t NThreadTileSize>
 struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 {
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
     static auto Make2dDescriptor_M_N(const std::vector<int>& shape, const std::vector<int>& stride)
     {
         return make_naive_tensor_descriptor(make_tuple(shape[0], shape[1]),
                                             make_tuple(stride[0], stride[1]));
     }
 
-    using GridDesc_M_N = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1}));
+    static constexpr index_t BlockSize   = MThreadPerBlock * NThreadPerBlock;
+    static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize;
+    static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize;
+
+    using GridDesc_M_N    = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1}));
+    using GridwiseEltwise = GridwiseElementwise_2D<ADataType,
+                                                   BDataType,
+                                                   CDataType,
+                                                   GridDesc_M_N,
+                                                   GridDesc_M_N,
+                                                   GridDesc_M_N,
+                                                   ElementwiseFunctor,
+                                                   MThreadTileSize,
+                                                   NThreadTileSize>;
 
     struct Argument : public BaseArgument
     {
@@ -55,12 +75,63 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 
     struct Invoker : public BaseInvoker
     {
+        index_t CalculateGridSize(const GridDesc_M_N& grid_desc_m_n)
+        {
+            const auto M = grid_desc_m_n.GetLength(I0);
+            const auto N = grid_desc_m_n.GetLength(I1);
+
+            assert(M % M_BlockTileSize == 0);
+            assert(N % N_BlockTileSize == 0);
+
+            return (M / M_BlockTileSize) * (N / N_BlockTileSize);
+        }
+
         float Run(const Argument& arg, int nrepeat = 1)
         {
+            const auto kernel = kernel_elementwise_2d<GridwiseEltwise,
+                                                      ADataType,
+                                                      BDataType,
+                                                      CDataType,
+                                                      GridDesc_M_N,
+                                                      GridDesc_M_N,
+                                                      GridDesc_M_N,
+                                                      ElementwiseFunctor>;
             // TODO
             (void)arg;
             (void)nrepeat;
-            return 0;
+            (void)kernel;
+            float avgTime          = 0;
+            const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m_n_);
+            if(nrepeat == 0)
+            {
+                launch_kernel(kernel,
+                              dim3(gridSize),
+                              dim3(BlockSize),
+                              0,
+                              arg.p_a_,
+                              arg.p_b_,
+                              arg.p_c_,
+                              arg.a_grid_desc_m_n_,
+                              arg.b_grid_desc_m_n_,
+                              arg.c_grid_desc_m_n_,
+                              arg.functor_);
+            }
+            else
+            {
+                avgTime = launch_and_time_kernel(kernel,
+                                                 nrepeat,
+                                                 dim3(gridSize),
+                                                 dim3(BlockSize),
+                                                 0,
+                                                 arg.p_a_,
+                                                 arg.p_b_,
+                                                 arg.p_c_,
+                                                 arg.a_grid_desc_m_n_,
+                                                 arg.b_grid_desc_m_n_,
+                                                 arg.c_grid_desc_m_n_,
+                                                 arg.functor_);
+            }
+            return avgTime;
         }
 
         float Run(const BaseArgument* p_arg, int nrepeat = 1) override
@@ -71,9 +142,18 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
     {
-        // TODO: properly implement this check
         const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
-        return pArg != nullptr;
+
+        if(pArg == nullptr)
+            return false;
+
+        const auto M = pArg->c_grid_desc_m_n_.GetLength(I0);
+        const auto N = pArg->c_grid_desc_m_n_.GetLength(I1);
+
+        if(M % M_BlockTileSize != 0 && N % N_BlockTileSize != 0)
+            return false;
+
+        return true;
     };
 
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
@@ -107,7 +187,6 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
         // clang-format off
         str << "DeviceElementwise_2D"
             << "<"
-            << BlockSize
             << ">";
         // clang-format on
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
new file mode 100644
index 00000000000..d8d55ee56fc
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include "cluster_descriptor.hpp"
+#include "data_type.hpp"
+#include "threadwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+template <typename GridwiseEltwise,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AGridDesc_M_N,
+          typename BGridDesc_M_N,
+          typename CGridDesc_M_N,
+          typename ElementwiseFunctor>
+__global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global,
+                                      const BDataType* __restrict__ p_b_global,
+                                      CDataType* __restrict__ p_c_global,
+                                      const AGridDesc_M_N a_grid_desc_m_k,
+                                      const BGridDesc_M_N b_grid_desc_m_k,
+                                      const CGridDesc_M_N c_grid_desc_m_k,
+                                      const ElementwiseFunctor functor)
+{
+    GridwiseEltwise::Run(p_a_global,
+                         p_b_global,
+                         p_c_global,
+                         a_grid_desc_m_k,
+                         b_grid_desc_m_k,
+                         c_grid_desc_m_k,
+                         functor);
+}
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AGridDesc_M_N,
+          typename BGridDesc_M_N,
+          typename CGridDesc_M_N,
+          typename ElementwiseFunctor,
+          index_t MThreadTileSize,
+          index_t NThreadTileSize>
+struct GridwiseElementwise_2D
+{
+    __device__ static void Run(const ADataType* __restrict__ p_a_global,
+                               const BDataType* __restrict__ p_b_global,
+                               CDataType* __restrict__ p_c_global,
+                               const AGridDesc_M_N a_grid_desc_m_n,
+                               const BGridDesc_M_N b_grid_desc_m_n,
+                               const CGridDesc_M_N c_grid_desc_m_n,
+                               const ElementwiseFunctor functor)
+    {
+        // const index_t thread_id = get_thread_local_1d_id();
+        // const index_t block_id = get_block_1d_id();
+        // printf("block_id = %d, thread_id = %d \n", block_id, thread_id);
+
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_global, a_grid_desc_m_n.GetElementSpaceSize());
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_global, b_grid_desc_m_n.GetElementSpaceSize());
+        const auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_global, c_grid_desc_m_n.GetElementSpaceSize());
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, MThreadTileSize * NThreadTileSize, true>
+            a_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, MThreadTileSize * NThreadTileSize, true>
+            b_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, CDataType, MThreadTileSize * NThreadTileSize, true>
+            c_thread_buf;
+
+        // TODO - buffer_load, apply functor, buffer_store
+        (void)a_global_buf;
+        (void)b_global_buf;
+        (void)c_global_buf;
+        (void)a_thread_buf;
+        (void)b_thread_buf;
+        (void)c_thread_buf;
+        (void)functor;
+    }
+};
+
+} // namespace ck

From a760a73213f3352f97688e6eb4eaddcdba79f6e1 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Tue, 12 Apr 2022 19:55:43 +0000
Subject: [PATCH 06/28] A kernel of elementwise_2d (except global store)

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp |  14 +--
 .../gpu/device/device_elementwise_2d.hpp      |  29 +++--
 .../gpu/grid/gridwise_elementwise_2d.hpp      | 117 ++++++++++++++----
 3 files changed, 118 insertions(+), 42 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index af8152e9d9f..dd5c32ade61 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -84,8 +84,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
     8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
-constexpr int Rank                        = 2;
-constexpr int NumReduceDim                = 1;
+constexpr int Rank                      = 2;
+constexpr int NumReduceDim              = 1;
 constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
 constexpr ck::NanPropagation NanOpt     = ck::NanPropagation::PROPAGATE_NAN;
 constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
@@ -118,14 +118,14 @@ using DeviceReduceInstance =
 
 struct Sub
 {
-    __host__ __device__ constexpr void operator()(F16& dst, const F16& src1, const F16& src2) const
+    __host__ __device__ constexpr void operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
     {
         dst = src1 - src2;
     }
 };
 
-using DeviceElementwiseInstance =
-    ck::tensor_operation::device::DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub, 16, 16, 8, 8>;
+using DeviceElementwiseInstance = ck::tensor_operation::device::
+    DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
@@ -302,8 +302,8 @@ int main(int argc, char* argv[])
 
     if(!broadcastSub.IsSupportedArgument(broadcastSub_argument_ptr.get()))
     {
-        throw std::runtime_error(
-            "The runtime parameters seems not supported by the DeviceElementwise_2D instance, exiting!");
+        throw std::runtime_error("The runtime parameters seems not supported by the "
+                                 "DeviceElementwise_2D instance, exiting!");
     };
 
     auto broadcastSub_invoker_ptr = broadcastSub.MakeInvokerPointer();
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
index cabdcc6b5e8..db9974b4a00 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -17,9 +17,17 @@ template <typename ADataType,
           index_t MThreadPerBlock,
           index_t NThreadPerBlock,
           index_t MThreadTileSize,
-          index_t NThreadTileSize>
+          index_t NThreadTileSize,
+          index_t AThreadTransferSrcVectorDim,
+          index_t AThreadTransferSrcScalarPerVector,
+          index_t BThreadTransferSrcVectorDim,
+          index_t BThreadTransferSrcScalarPerVector,
+          index_t CThreadTransferSrcScalarPerVector>
 struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 {
+    static_assert(NThreadTileSize % AThreadTransferSrcScalarPerVector == 0 &&
+                  NThreadTileSize % BThreadTransferSrcScalarPerVector == 0);
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
 
@@ -38,11 +46,16 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
                                                    BDataType,
                                                    CDataType,
                                                    GridDesc_M_N,
-                                                   GridDesc_M_N,
-                                                   GridDesc_M_N,
                                                    ElementwiseFunctor,
+                                                   MThreadPerBlock,
+                                                   NThreadPerBlock,
                                                    MThreadTileSize,
-                                                   NThreadTileSize>;
+                                                   NThreadTileSize,
+                                                   AThreadTransferSrcVectorDim,
+                                                   AThreadTransferSrcScalarPerVector,
+                                                   BThreadTransferSrcVectorDim,
+                                                   BThreadTransferSrcScalarPerVector,
+                                                   CThreadTransferSrcScalarPerVector>;
 
     struct Argument : public BaseArgument
     {
@@ -88,18 +101,12 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 
         float Run(const Argument& arg, int nrepeat = 1)
         {
-            const auto kernel = kernel_elementwise_2d<GridwiseEltwise,
+            const auto kernel      = kernel_elementwise_2d<GridwiseEltwise,
                                                       ADataType,
                                                       BDataType,
                                                       CDataType,
                                                       GridDesc_M_N,
-                                                      GridDesc_M_N,
-                                                      GridDesc_M_N,
                                                       ElementwiseFunctor>;
-            // TODO
-            (void)arg;
-            (void)nrepeat;
-            (void)kernel;
             float avgTime          = 0;
             const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m_n_);
             if(nrepeat == 0)
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
index d8d55ee56fc..97a6d0ab261 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -10,16 +10,14 @@ template <typename GridwiseEltwise,
           typename ADataType,
           typename BDataType,
           typename CDataType,
-          typename AGridDesc_M_N,
-          typename BGridDesc_M_N,
-          typename CGridDesc_M_N,
+          typename GridDesc_M_N,
           typename ElementwiseFunctor>
 __global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global,
                                       const BDataType* __restrict__ p_b_global,
                                       CDataType* __restrict__ p_c_global,
-                                      const AGridDesc_M_N a_grid_desc_m_k,
-                                      const BGridDesc_M_N b_grid_desc_m_k,
-                                      const CGridDesc_M_N c_grid_desc_m_k,
+                                      const GridDesc_M_N a_grid_desc_m_k,
+                                      const GridDesc_M_N b_grid_desc_m_k,
+                                      const GridDesc_M_N c_grid_desc_m_k,
                                       const ElementwiseFunctor functor)
 {
     GridwiseEltwise::Run(p_a_global,
@@ -34,26 +32,58 @@ __global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global,
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          typename AGridDesc_M_N,
-          typename BGridDesc_M_N,
-          typename CGridDesc_M_N,
+          typename GridDesc_M_N,
           typename ElementwiseFunctor,
+          index_t MThreadPerBlock,
+          index_t NThreadPerBlock,
           index_t MThreadTileSize,
-          index_t NThreadTileSize>
+          index_t NThreadTileSize,
+          index_t AThreadTransferSrcVectorDim,
+          index_t AThreadTransferSrcScalarPerVector,
+          index_t BThreadTransferSrcVectorDim,
+          index_t BThreadTransferSrcScalarPerVector,
+          index_t CThreadTransferSrcScalarPerVector>
 struct GridwiseElementwise_2D
 {
+    static constexpr auto thread_buf_desc_M_N = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MThreadTileSize>{}, Number<NThreadTileSize>{}));
+
+    using ThreadBufDesc_M_N = decltype(thread_buf_desc_M_N);
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
+    static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize;
+    static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize;
+
+    static __device__ __host__ auto CalculateElementwiseIndex(const GridDesc_M_N& grid_desc_m_n)
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+        const index_t block_id  = get_block_1d_id();
+
+        const index_t M              = grid_desc_m_n.GetLength(I0);
+        const index_t gridSize_m     = M / M_BlockTileSize;
+        const index_t block_2d_idx_m = block_id % gridSize_m;
+        const index_t block_2d_idx_n = block_id / gridSize_m;
+
+        constexpr auto thread_desc =
+            make_cluster_descriptor(Sequence<MThreadPerBlock, NThreadPerBlock>{}, Sequence<1, 0>{});
+
+        const auto thread_2d_idx = thread_desc.CalculateBottomIndex(make_multi_index(thread_id));
+
+        return make_multi_index(
+            block_2d_idx_m * M_BlockTileSize + thread_2d_idx[I0] * MThreadTileSize,
+            block_2d_idx_n * N_BlockTileSize + thread_2d_idx[I1] * NThreadTileSize);
+    }
+
     __device__ static void Run(const ADataType* __restrict__ p_a_global,
                                const BDataType* __restrict__ p_b_global,
                                CDataType* __restrict__ p_c_global,
-                               const AGridDesc_M_N a_grid_desc_m_n,
-                               const BGridDesc_M_N b_grid_desc_m_n,
-                               const CGridDesc_M_N c_grid_desc_m_n,
+                               const GridDesc_M_N a_grid_desc_m_n,
+                               const GridDesc_M_N b_grid_desc_m_n,
+                               const GridDesc_M_N c_grid_desc_m_n,
                                const ElementwiseFunctor functor)
     {
-        // const index_t thread_id = get_thread_local_1d_id();
-        // const index_t block_id = get_block_1d_id();
-        // printf("block_id = %d, thread_id = %d \n", block_id, thread_id);
-
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_global, a_grid_desc_m_n.GetElementSpaceSize());
         const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -68,14 +98,53 @@ struct GridwiseElementwise_2D
         StaticBuffer<AddressSpaceEnum::Vgpr, CDataType, MThreadTileSize * NThreadTileSize, true>
             c_thread_buf;
 
-        // TODO - buffer_load, apply functor, buffer_store
-        (void)a_global_buf;
-        (void)b_global_buf;
+        const auto a_global_load_offset = CalculateElementwiseIndex(a_grid_desc_m_n);
+        const auto b_global_load_offset = CalculateElementwiseIndex(b_grid_desc_m_n);
+
+        auto a_global_load = ThreadwiseTensorSliceTransfer_v2<
+            ADataType,
+            ADataType,
+            GridDesc_M_N,
+            decltype(thread_buf_desc_M_N),
+            Sequence<MThreadTileSize, NThreadTileSize>, // SliceLengths
+            Sequence<0, 1>,                             // DimAccessOrder
+            AThreadTransferSrcVectorDim,
+            AThreadTransferSrcScalarPerVector,
+            1, // SrcScalarStrideInVector
+            false>{a_grid_desc_m_n, a_global_load_offset};
+
+        auto b_global_load = ThreadwiseTensorSliceTransfer_v2<
+            BDataType,
+            BDataType,
+            GridDesc_M_N,
+            decltype(thread_buf_desc_M_N),
+            Sequence<MThreadTileSize, NThreadTileSize>, // SliceLengths
+            Sequence<0, 1>,                             // DimAccessOrder
+            BThreadTransferSrcVectorDim,
+            BThreadTransferSrcScalarPerVector,
+            1, // SrcScalarStrideInVector
+            false>{b_grid_desc_m_n, b_global_load_offset};
+
+        a_global_load.Run(
+            a_grid_desc_m_n, a_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), a_thread_buf);
+
+        b_global_load.Run(
+            b_grid_desc_m_n, b_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), b_thread_buf);
+
+        static_for<0, MThreadTileSize, 1>{}([&](auto m) {
+            static_for<0, NThreadTileSize, 1>{}([&](auto n) {
+                constexpr auto offset = thread_buf_desc_M_N.CalculateOffset(make_tuple(m, n));
+                functor(c_thread_buf(Number<offset>{}),
+                        a_thread_buf(Number<offset>{}),
+                        b_thread_buf(Number<offset>{}));
+            });
+        });
+
+        // TODO - global write
         (void)c_global_buf;
-        (void)a_thread_buf;
-        (void)b_thread_buf;
-        (void)c_thread_buf;
-        (void)functor;
+        // c_global_write.Run(
+        //     thread_buf_desc_M_N, c_thread_buf, c_grid_desc_m_n, make_tuple(I0, I0),
+        //     c_global_buf);
     }
 };
 

From c8b4ac223637f5f8cb24d4e374ec8ef2a88155fd Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 13 Apr 2022 12:28:32 +0000
Subject: [PATCH 07/28] Add global write

---
 .../gpu/grid/gridwise_elementwise_2d.hpp      | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
index 97a6d0ab261..713411c63a4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
@@ -2,6 +2,7 @@
 
 #include "cluster_descriptor.hpp"
 #include "data_type.hpp"
+#include "element_wise_operation.hpp"
 #include "threadwise_tensor_slice_transfer.hpp"
 
 namespace ck {
@@ -48,6 +49,7 @@ struct GridwiseElementwise_2D
     static constexpr auto thread_buf_desc_M_N = make_naive_tensor_descriptor_packed(
         make_tuple(Number<MThreadTileSize>{}, Number<NThreadTileSize>{}));
 
+    using PassThrough       = tensor_operation::element_wise::PassThrough;
     using ThreadBufDesc_M_N = decltype(thread_buf_desc_M_N);
 
     static constexpr auto I0 = Number<0>{};
@@ -88,7 +90,7 @@ struct GridwiseElementwise_2D
             p_a_global, a_grid_desc_m_n.GetElementSpaceSize());
         const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_b_global, b_grid_desc_m_n.GetElementSpaceSize());
-        const auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_global, c_grid_desc_m_n.GetElementSpaceSize());
 
         StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, MThreadTileSize * NThreadTileSize, true>
@@ -141,10 +143,23 @@ struct GridwiseElementwise_2D
         });
 
         // TODO - global write
-        (void)c_global_buf;
-        // c_global_write.Run(
-        //     thread_buf_desc_M_N, c_thread_buf, c_grid_desc_m_n, make_tuple(I0, I0),
-        //     c_global_buf);
+        const auto c_global_write_offset = CalculateElementwiseIndex(c_grid_desc_m_n);
+        auto c_global_write              = ThreadwiseTensorSliceTransfer_v1r3<
+            CDataType,
+            CDataType,
+            decltype(thread_buf_desc_M_N),
+            GridDesc_M_N,
+            PassThrough,
+            Sequence<MThreadTileSize, NThreadTileSize>, // SliceLengths
+            Sequence<0, 1>,                             // DimAccessOrder
+            1,                                          // DstVectorDim
+            CThreadTransferSrcScalarPerVector,          // DstScalarPerVector
+            InMemoryDataOperationEnum::Set,             // DstInMemOp
+            1,                                          // DstScalarStrideInVector
+            false>{c_grid_desc_m_n, c_global_write_offset, PassThrough{}};
+
+        c_global_write.Run(
+            thread_buf_desc_M_N, make_tuple(I0, I0), c_thread_buf, c_grid_desc_m_n, c_global_buf);
     }
 };
 

From f2540aa5b4af1658e28607150ae285ef9736df43 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 13 Apr 2022 12:56:00 +0000
Subject: [PATCH 08/28] Add exponential

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index dd5c32ade61..eabc03e23e0 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -4,6 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include <math.h>
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
@@ -116,16 +117,19 @@ using DeviceReduceInstance =
                                                         1,
                                                         1>;
 
-struct Sub
+struct Sub_Exp
 {
     __host__ __device__ constexpr void operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
     {
         dst = src1 - src2;
+        // FIXME - use float16 exponential
+        float dst_f32 = static_cast<float>(dst);
+        dst = static_cast<CDataType>(exp(dst_f32));
     }
 };
 
 using DeviceElementwiseInstance = ck::tensor_operation::device::
-    DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
+    DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub_Exp, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
@@ -289,25 +293,25 @@ int main(int argc, char* argv[])
     reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat);
 
     // do broadcast sub
-    auto broadcastSub = DeviceElementwiseInstance{};
-    auto broadcastSub_argument_ptr =
-        broadcastSub.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(),
+    auto broadcastSubExp = DeviceElementwiseInstance{};
+    auto broadcastSubExp_argument_ptr =
+        broadcastSubExp.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(),
                                          c_m_n_max_device_buf.GetDeviceBuffer(),
                                          d_m_n_device_buf.GetDeviceBuffer(),
                                          {M, N},
                                          {StrideC, 1},
                                          {0, 1},
                                          {StrideC, 1},
-                                         Sub{});
+                                         Sub_Exp{});
 
-    if(!broadcastSub.IsSupportedArgument(broadcastSub_argument_ptr.get()))
+    if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get()))
     {
         throw std::runtime_error("The runtime parameters seems not supported by the "
                                  "DeviceElementwise_2D instance, exiting!");
     };
 
-    auto broadcastSub_invoker_ptr = broadcastSub.MakeInvokerPointer();
-    broadcastSub_invoker_ptr->Run(broadcastSub_argument_ptr.get(), nrepeat);
+    auto broadcastSubExp_invoker_ptr = broadcastSubExp.MakeInvokerPointer();
+    broadcastSubExp_invoker_ptr->Run(broadcastSubExp_argument_ptr.get(), nrepeat);
 
     // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv
     // TODO = do_verification

From 30348daad693624a6056c559085a50fa18cdf090 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 13 Apr 2022 12:59:36 +0000
Subject: [PATCH 09/28] [What] Refine naming [Why] Prepare to add reduceSum

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp  | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index eabc03e23e0..2313c7af6f5 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -85,25 +85,25 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
     8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
-constexpr int Rank                      = 2;
+constexpr int ReduceRank                      = 2;
 constexpr int NumReduceDim              = 1;
-constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
+constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX;
 constexpr ck::NanPropagation NanOpt     = ck::NanPropagation::PROPAGATE_NAN;
 constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 // constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES;
-using ReduceOperation = typename ck::reduce_binary_operator<CDataType, ReduceOpId>::opType;
+using ReduceMaxOp = typename ck::reduce_binary_operator<CDataType, ReduceMaxId>::opType;
 using InElementwiseOperation =
-    typename ck::reduce_unary_operator<CDataType, ReduceOpId, true, true>::InElementwiseOperation;
+    typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::InElementwiseOperation;
 using AccElementwiseOperation =
-    typename ck::reduce_unary_operator<CDataType, ReduceOpId, true, true>::AccElementwiseOperation;
+    typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::AccElementwiseOperation;
 
-using DeviceReduceInstance =
+using DeviceReduceMaxInstance =
     ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
                                                         CDataType,
                                                         CDataType,
-                                                        Rank,
+                                                        ReduceRank,
                                                         NumReduceDim,
-                                                        ReduceOperation,
+                                                        ReduceMaxOp,
                                                         InElementwiseOperation,
                                                         AccElementwiseOperation,
                                                         PropagateNan,
@@ -264,7 +264,7 @@ int main(int argc, char* argv[])
     gemm_invoker.Run(gemm_argument, nrepeat);
 
     // do reduce max
-    auto reduce_max    = DeviceReduceInstance{};
+    auto reduce_max    = DeviceReduceMaxInstance{};
     auto wsSizeInBytes = reduce_max.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
     DeviceMem ws_dev(wsSizeInBytes);
 

From b05a594e528e0feba663657ec551a32d55e8cce3 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 13 Apr 2022 19:52:38 +0000
Subject: [PATCH 10/28] Add reduce sum for denominator of softmax

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 146 ++++++++++++------
 1 file changed, 103 insertions(+), 43 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 2313c7af6f5..0e86e55c1b4 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -85,27 +85,53 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
     8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
-constexpr int ReduceRank                      = 2;
-constexpr int NumReduceDim              = 1;
+constexpr int Rank                       = 2;
+constexpr int NumReduceDim               = 1;
 constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX;
-constexpr ck::NanPropagation NanOpt     = ck::NanPropagation::PROPAGATE_NAN;
+constexpr ck::ReduceTensorOp ReduceSumId = ck::ReduceTensorOp::ADD;
+constexpr ck::NanPropagation NanOpt      = ck::NanPropagation::PROPAGATE_NAN;
 constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
 // constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES;
 using ReduceMaxOp = typename ck::reduce_binary_operator<CDataType, ReduceMaxId>::opType;
-using InElementwiseOperation =
+using ReduceSumOp = typename ck::reduce_binary_operator<CDataType, ReduceSumId>::opType;
+using ReduceMaxInElementwiseOperation =
     typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::InElementwiseOperation;
-using AccElementwiseOperation =
+using ReduceMaxAccElementwiseOperation =
     typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::AccElementwiseOperation;
+using ReduceSumInElementwiseOperation =
+    typename ck::reduce_unary_operator<CDataType, ReduceSumId, true, true>::InElementwiseOperation;
+using ReduceSumAccElementwiseOperation =
+    typename ck::reduce_unary_operator<CDataType, ReduceSumId, true, true>::AccElementwiseOperation;
 
 using DeviceReduceMaxInstance =
     ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
                                                         CDataType,
                                                         CDataType,
-                                                        ReduceRank,
+                                                        Rank,
                                                         NumReduceDim,
                                                         ReduceMaxOp,
-                                                        InElementwiseOperation,
-                                                        AccElementwiseOperation,
+                                                        ReduceMaxInElementwiseOperation,
+                                                        ReduceMaxAccElementwiseOperation,
+                                                        PropagateNan,
+                                                        false,
+                                                        256,
+                                                        4,
+                                                        64,
+                                                        1,
+                                                        1,
+                                                        0,
+                                                        1,
+                                                        1>;
+
+using DeviceReduceSumInstance =
+    ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
+                                                        CDataType,
+                                                        CDataType,
+                                                        Rank,
+                                                        NumReduceDim,
+                                                        ReduceSumOp,
+                                                        ReduceSumInElementwiseOperation,
+                                                        ReduceSumAccElementwiseOperation,
                                                         PropagateNan,
                                                         false,
                                                         256,
@@ -119,12 +145,13 @@ using DeviceReduceMaxInstance =
 
 struct Sub_Exp
 {
-    __host__ __device__ constexpr void operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
+    __host__ __device__ constexpr void
+    operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
     {
         dst = src1 - src2;
         // FIXME - use float16 exponential
         float dst_f32 = static_cast<float>(dst);
-        dst = static_cast<CDataType>(exp(dst_f32));
+        dst           = static_cast<CDataType>(exp(dst_f32));
     }
 };
 
@@ -198,22 +225,25 @@ int main(int argc, char* argv[])
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<int> c_m_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
-                          std::vector<std::size_t>({1}));
-    Tensor<CDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<int> c_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
+                        std::vector<std::size_t>({1}));
+    Tensor<CDataType> exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<CDataType> exp_n_sum(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
+                                std::vector<std::size_t>({1}));
 
-    const auto i_inLengths  = ck::to_int_vector(c_m_n.mDesc.GetLengths());
-    const auto i_inStrides  = ck::to_int_vector(c_m_n.mDesc.GetStrides());
-    const auto i_outLengths = ck::to_int_vector(c_m_n_max.mDesc.GetLengths());
-    const auto i_outStrides = ck::to_int_vector(c_m_n_max.mDesc.GetStrides());
+    const auto c_m_n_shape  = ck::to_int_vector(c_m_n.mDesc.GetLengths());
+    const auto c_m_n_stride  = ck::to_int_vector(c_m_n.mDesc.GetStrides());
+    const auto reduce_n_shape = ck::to_int_vector(c_n_max.mDesc.GetLengths());
+    const auto reduce_n_stride = ck::to_int_vector(c_n_max.mDesc.GetStrides());
 
-    size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_m_n_max.mDesc.GetElementSize();
+    size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_n_max.mDesc.GetElementSize();
 
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "c_m_n: " << c_m_n.mDesc << std::endl;
-    std::cout << "c_m_n_max: " << c_m_n_max.mDesc << std::endl;
-    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "c_n_max: " << c_n_max.mDesc << std::endl;
+    std::cout << "exp_m_n: " << exp_m_n.mDesc << std::endl;
+    std::cout << "exp_n_sum: " << exp_n_sum.mDesc << std::endl;
 
     switch(init_method)
     {
@@ -230,9 +260,10 @@ int main(int argc, char* argv[])
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
     DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
     DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_max_device_buf(sizeof(CDataType) * c_m_n_max.mDesc.GetElementSpace());
-    DeviceMem c_m_n_max_indices_dev(0);
-    DeviceMem d_m_n_device_buf(sizeof(CDataType) * d_m_n.mDesc.GetElementSpace());
+    DeviceMem c_n_max_device_buf(sizeof(CDataType) * c_n_max.mDesc.GetElementSpace());
+    DeviceMem indices_device_buf(0);
+    DeviceMem exp_m_n_device_buf(sizeof(CDataType) * exp_m_n.mDesc.GetElementSpace());
+    DeviceMem exp_n_sum_device_buf(sizeof(CDataType) * exp_n_sum.mDesc.GetElementSpace());
 
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -265,23 +296,23 @@ int main(int argc, char* argv[])
 
     // do reduce max
     auto reduce_max    = DeviceReduceMaxInstance{};
-    auto wsSizeInBytes = reduce_max.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-    DeviceMem ws_dev(wsSizeInBytes);
+    auto reduce_max_workaspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims);
+    DeviceMem reduce_max_workaspace_device_buf(reduce_max_workaspace_size);
 
     auto reduce_max_argument_ptr = reduce_max.MakeArgumentPointer(
-        i_inLengths,
-        i_inStrides,
-        i_outLengths,
-        i_outStrides,
+        c_m_n_shape,
+        c_m_n_stride,
+        reduce_n_shape,
+        reduce_n_stride,
         reduceDims,
         1,
         0,
         c_m_n_device_buf.GetDeviceBuffer(),
-        c_m_n_max_device_buf.GetDeviceBuffer(),
-        c_m_n_max_indices_dev.GetDeviceBuffer(),
-        ws_dev.GetDeviceBuffer(),
-        InElementwiseOperation{static_cast<int>(reduce_total_length)},
-        AccElementwiseOperation{static_cast<int>(reduce_total_length)});
+        c_n_max_device_buf.GetDeviceBuffer(),
+        indices_device_buf.GetDeviceBuffer(),
+        reduce_max_workaspace_device_buf.GetDeviceBuffer(),
+        ReduceMaxInElementwiseOperation{static_cast<int>(reduce_total_length)},
+        ReduceMaxAccElementwiseOperation{static_cast<int>(reduce_total_length)});
 
     if(!reduce_max.IsSupportedArgument(reduce_max_argument_ptr.get()))
     {
@@ -292,17 +323,17 @@ int main(int argc, char* argv[])
     auto reduce_max_invoker_ptr = reduce_max.MakeInvokerPointer();
     reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat);
 
-    // do broadcast sub
+    // do broadcast sub and exp
     auto broadcastSubExp = DeviceElementwiseInstance{};
     auto broadcastSubExp_argument_ptr =
         broadcastSubExp.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(),
-                                         c_m_n_max_device_buf.GetDeviceBuffer(),
-                                         d_m_n_device_buf.GetDeviceBuffer(),
-                                         {M, N},
-                                         {StrideC, 1},
-                                         {0, 1},
-                                         {StrideC, 1},
-                                         Sub_Exp{});
+                                            c_n_max_device_buf.GetDeviceBuffer(),
+                                            exp_m_n_device_buf.GetDeviceBuffer(),
+                                            {M, N},
+                                            {StrideC, 1},
+                                            {0, 1},
+                                            {StrideC, 1},
+                                            Sub_Exp{});
 
     if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get()))
     {
@@ -313,7 +344,36 @@ int main(int argc, char* argv[])
     auto broadcastSubExp_invoker_ptr = broadcastSubExp.MakeInvokerPointer();
     broadcastSubExp_invoker_ptr->Run(broadcastSubExp_argument_ptr.get(), nrepeat);
 
-    // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv
+    // do reduce sum - denominator of softmax
+    auto reduce_sum    = DeviceReduceSumInstance{};
+    auto reduce_sum_workaspace_size = reduce_sum.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims);
+    DeviceMem reduce_sum_workaspace_device_buf(reduce_sum_workaspace_size);
+
+    auto reduce_sum_argument_ptr = reduce_sum.MakeArgumentPointer(
+        c_m_n_shape,
+        c_m_n_stride,
+        reduce_n_shape,
+        reduce_n_stride,
+        reduceDims,
+        1,
+        0,
+        exp_m_n_device_buf.GetDeviceBuffer(),
+        exp_n_sum_device_buf.GetDeviceBuffer(),
+        indices_device_buf.GetDeviceBuffer(),
+        reduce_sum_workaspace_device_buf.GetDeviceBuffer(),
+        ReduceSumInElementwiseOperation{static_cast<int>(reduce_total_length)},
+        ReduceSumAccElementwiseOperation{static_cast<int>(reduce_total_length)});
+
+    if(!reduce_sum.IsSupportedArgument(reduce_sum_argument_ptr.get()))
+    {
+        throw std::runtime_error(
+            "The runtime parameters seems not supported by the DeviceReduce instance, exiting!");
+    };
+
+    auto reduce_sum_invoker_ptr = reduce_sum.MakeInvokerPointer();
+    reduce_sum_invoker_ptr->Run(reduce_sum_argument_ptr.get(), nrepeat);
+
+    // TODO - Need BroadcastDiv
     // TODO = do_verification
     (void)do_verification;
     return 0;

From 6a781e517640de05eb63b2f269585ff4c7b87a39 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 13 Apr 2022 20:04:08 +0000
Subject: [PATCH 11/28] Add broadcast div, the final step of softmax

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 51 ++++++++++++++++---
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 0e86e55c1b4..f11205f6437 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -155,9 +155,21 @@ struct Sub_Exp
     }
 };
 
-using DeviceElementwiseInstance = ck::tensor_operation::device::
+struct Div
+{
+    __host__ __device__ constexpr void
+    operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
+    {
+        dst = src1 / src2;
+    }
+};
+
+using DeviceElementwiseSubExpInstance = ck::tensor_operation::device::
     DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub_Exp, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
 
+using DeviceElementwiseDivInstance = ck::tensor_operation::device::
+    DeviceElementwise_2D<CDataType, CDataType, CDataType, Div, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
+
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
 
@@ -230,10 +242,11 @@ int main(int argc, char* argv[])
     Tensor<CDataType> exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> exp_n_sum(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
                                 std::vector<std::size_t>({1}));
+    Tensor<CDataType> softmax_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
-    const auto c_m_n_shape  = ck::to_int_vector(c_m_n.mDesc.GetLengths());
-    const auto c_m_n_stride  = ck::to_int_vector(c_m_n.mDesc.GetStrides());
-    const auto reduce_n_shape = ck::to_int_vector(c_n_max.mDesc.GetLengths());
+    const auto c_m_n_shape     = ck::to_int_vector(c_m_n.mDesc.GetLengths());
+    const auto c_m_n_stride    = ck::to_int_vector(c_m_n.mDesc.GetStrides());
+    const auto reduce_n_shape  = ck::to_int_vector(c_n_max.mDesc.GetLengths());
     const auto reduce_n_stride = ck::to_int_vector(c_n_max.mDesc.GetStrides());
 
     size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_n_max.mDesc.GetElementSize();
@@ -244,6 +257,7 @@ int main(int argc, char* argv[])
     std::cout << "c_n_max: " << c_n_max.mDesc << std::endl;
     std::cout << "exp_m_n: " << exp_m_n.mDesc << std::endl;
     std::cout << "exp_n_sum: " << exp_n_sum.mDesc << std::endl;
+    std::cout << "softmax_m_n: " << softmax_m_n.mDesc << std::endl;
 
     switch(init_method)
     {
@@ -264,6 +278,7 @@ int main(int argc, char* argv[])
     DeviceMem indices_device_buf(0);
     DeviceMem exp_m_n_device_buf(sizeof(CDataType) * exp_m_n.mDesc.GetElementSpace());
     DeviceMem exp_n_sum_device_buf(sizeof(CDataType) * exp_n_sum.mDesc.GetElementSpace());
+    DeviceMem softmax_m_n_device_buf(sizeof(CDataType) * softmax_m_n.mDesc.GetElementSpace());
 
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -295,7 +310,7 @@ int main(int argc, char* argv[])
     gemm_invoker.Run(gemm_argument, nrepeat);
 
     // do reduce max
-    auto reduce_max    = DeviceReduceMaxInstance{};
+    auto reduce_max                 = DeviceReduceMaxInstance{};
     auto reduce_max_workaspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims);
     DeviceMem reduce_max_workaspace_device_buf(reduce_max_workaspace_size);
 
@@ -324,7 +339,7 @@ int main(int argc, char* argv[])
     reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat);
 
     // do broadcast sub and exp
-    auto broadcastSubExp = DeviceElementwiseInstance{};
+    auto broadcastSubExp = DeviceElementwiseSubExpInstance{};
     auto broadcastSubExp_argument_ptr =
         broadcastSubExp.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(),
                                             c_n_max_device_buf.GetDeviceBuffer(),
@@ -345,7 +360,7 @@ int main(int argc, char* argv[])
     broadcastSubExp_invoker_ptr->Run(broadcastSubExp_argument_ptr.get(), nrepeat);
 
     // do reduce sum - denominator of softmax
-    auto reduce_sum    = DeviceReduceSumInstance{};
+    auto reduce_sum                 = DeviceReduceSumInstance{};
     auto reduce_sum_workaspace_size = reduce_sum.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims);
     DeviceMem reduce_sum_workaspace_device_buf(reduce_sum_workaspace_size);
 
@@ -373,7 +388,27 @@ int main(int argc, char* argv[])
     auto reduce_sum_invoker_ptr = reduce_sum.MakeInvokerPointer();
     reduce_sum_invoker_ptr->Run(reduce_sum_argument_ptr.get(), nrepeat);
 
-    // TODO - Need BroadcastDiv
+    // do broadcast div
+    auto broadcastDiv = DeviceElementwiseDivInstance{};
+    auto broadcastDiv_argument_ptr =
+        broadcastDiv.MakeArgumentPointer(exp_m_n_device_buf.GetDeviceBuffer(),
+                                         exp_n_sum_device_buf.GetDeviceBuffer(),
+                                         softmax_m_n_device_buf.GetDeviceBuffer(),
+                                         {M, N},
+                                         {StrideC, 1},
+                                         {0, 1},
+                                         {StrideC, 1},
+                                         Div{});
+
+    if(!broadcastDiv.IsSupportedArgument(broadcastDiv_argument_ptr.get()))
+    {
+        throw std::runtime_error("The runtime parameters seems not supported by the "
+                                 "DeviceElementwise_2D instance, exiting!");
+    };
+
+    auto broadcastDiv_invoker_ptr = broadcastDiv.MakeInvokerPointer();
+    broadcastDiv_invoker_ptr->Run(broadcastDiv_argument_ptr.get(), nrepeat);
+
     // TODO = do_verification
     (void)do_verification;
     return 0;

From dba65b1c71197d63688c75fb0290142b7a0f30e4 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Thu, 14 Apr 2022 19:18:34 +0000
Subject: [PATCH 12/28] Rewrite the gridwise_elementwise_ 2d as 1d version

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp |   4 +-
 .../gpu/device/device_elementwise_2d.hpp      | 111 ++++++------
 .../gpu/grid/gridwise_elementwise_1d.hpp      | 149 ++++++++++++++++
 .../gpu/grid/gridwise_elementwise_2d.hpp      | 166 ------------------
 4 files changed, 200 insertions(+), 230 deletions(-)
 create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
 delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index f11205f6437..9f2cbe1e625 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -165,10 +165,10 @@ struct Div
 };
 
 using DeviceElementwiseSubExpInstance = ck::tensor_operation::device::
-    DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub_Exp, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
+    DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub_Exp, 256, 32, 8>;
 
 using DeviceElementwiseDivInstance = ck::tensor_operation::device::
-    DeviceElementwise_2D<CDataType, CDataType, CDataType, Div, 16, 16, 8, 8, 1, 1, 1, 1, 1>;
+    DeviceElementwise_2D<CDataType, CDataType, CDataType, Div, 256, 32, 8>;
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
index db9974b4a00..182929f63cd 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -4,7 +4,7 @@
 
 #include "device.hpp"
 #include "device_elementwise.hpp"
-#include "gridwise_elementwise_2d.hpp"
+#include "gridwise_elementwise_1d.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -14,48 +14,40 @@ template <typename ADataType,
           typename BDataType,
           typename CDataType,
           typename ElementwiseFunctor,
-          index_t MThreadPerBlock,
-          index_t NThreadPerBlock,
-          index_t MThreadTileSize,
-          index_t NThreadTileSize,
-          index_t AThreadTransferSrcVectorDim,
-          index_t AThreadTransferSrcScalarPerVector,
-          index_t BThreadTransferSrcVectorDim,
-          index_t BThreadTransferSrcScalarPerVector,
-          index_t CThreadTransferSrcScalarPerVector>
+          index_t ThreadPerBlock,
+          index_t ThreadTileSize,
+          index_t ScalarPerVector>
 struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 {
-    static_assert(NThreadTileSize % AThreadTransferSrcScalarPerVector == 0 &&
-                  NThreadTileSize % BThreadTransferSrcScalarPerVector == 0);
+    static_assert(ThreadTileSize % ScalarPerVector == 0);
+    static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize;
+    static constexpr auto I0           = Number<0>{};
 
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-
-    static auto Make2dDescriptor_M_N(const std::vector<int>& shape, const std::vector<int>& stride)
+    static auto MakeDescriptor_M0(const std::vector<int>& shape, const std::vector<int>& stride)
     {
-        return make_naive_tensor_descriptor(make_tuple(shape[0], shape[1]),
-                                            make_tuple(stride[0], stride[1]));
+        const int m = shape[0];
+        const int n = shape[1];
+
+        // 2d desc - [m, n]
+        const auto desc_m_n =
+            make_naive_tensor_descriptor(make_tuple(m, n), make_tuple(stride[0], stride[1]));
+
+        // 1d desc - [m * n]
+        return transform_tensor_descriptor(desc_m_n,
+                                           make_tuple(make_merge_transform(make_tuple(m, n))),
+                                           make_tuple(Sequence<0, 1>{}),
+                                           make_tuple(Sequence<0>{}));
     }
 
-    static constexpr index_t BlockSize   = MThreadPerBlock * NThreadPerBlock;
-    static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize;
-    static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize;
-
-    using GridDesc_M_N    = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1}));
-    using GridwiseEltwise = GridwiseElementwise_2D<ADataType,
+    using GridDesc_M0     = decltype(MakeDescriptor_M0({1, 1}, {1, 1}));
+    using GridwiseEltwise = GridwiseElementwise_1D<ADataType,
                                                    BDataType,
                                                    CDataType,
-                                                   GridDesc_M_N,
+                                                   GridDesc_M0,
                                                    ElementwiseFunctor,
-                                                   MThreadPerBlock,
-                                                   NThreadPerBlock,
-                                                   MThreadTileSize,
-                                                   NThreadTileSize,
-                                                   AThreadTransferSrcVectorDim,
-                                                   AThreadTransferSrcScalarPerVector,
-                                                   BThreadTransferSrcVectorDim,
-                                                   BThreadTransferSrcScalarPerVector,
-                                                   CThreadTransferSrcScalarPerVector>;
+                                                   ThreadPerBlock,
+                                                   ThreadTileSize,
+                                                   ScalarPerVector>;
 
     struct Argument : public BaseArgument
     {
@@ -70,9 +62,9 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
             : p_a_(p_a),
               p_b_(p_b),
               p_c_(p_c),
-              a_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_a)),
-              b_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_b)),
-              c_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_c)),
+              a_grid_desc_m0_(MakeDescriptor_M0(shape, stride_a)),
+              b_grid_desc_m0_(MakeDescriptor_M0(shape, stride_b)),
+              c_grid_desc_m0_(MakeDescriptor_M0(shape, stride_c)),
               functor_(functor)
         {
         }
@@ -80,47 +72,42 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
         const ADataType* p_a_;
         const BDataType* p_b_;
         CDataType* p_c_;
-        GridDesc_M_N a_grid_desc_m_n_;
-        GridDesc_M_N b_grid_desc_m_n_;
-        GridDesc_M_N c_grid_desc_m_n_;
+        GridDesc_M0 a_grid_desc_m0_;
+        GridDesc_M0 b_grid_desc_m0_;
+        GridDesc_M0 c_grid_desc_m0_;
         ElementwiseFunctor functor_;
     };
 
     struct Invoker : public BaseInvoker
     {
-        index_t CalculateGridSize(const GridDesc_M_N& grid_desc_m_n)
+        index_t CalculateGridSize(const GridDesc_M0& grid_desc_m0)
         {
-            const auto M = grid_desc_m_n.GetLength(I0);
-            const auto N = grid_desc_m_n.GetLength(I1);
-
-            assert(M % M_BlockTileSize == 0);
-            assert(N % N_BlockTileSize == 0);
-
-            return (M / M_BlockTileSize) * (N / N_BlockTileSize);
+            const auto gridTileSize = grid_desc_m0.GetLength(I0);
+            return gridTileSize / BlockTileSize;
         }
 
         float Run(const Argument& arg, int nrepeat = 1)
         {
-            const auto kernel      = kernel_elementwise_2d<GridwiseEltwise,
+            const auto kernel      = kernel_elementwise_1d<GridwiseEltwise,
                                                       ADataType,
                                                       BDataType,
                                                       CDataType,
-                                                      GridDesc_M_N,
+                                                      GridDesc_M0,
                                                       ElementwiseFunctor>;
             float avgTime          = 0;
-            const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m_n_);
+            const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m0_);
             if(nrepeat == 0)
             {
                 launch_kernel(kernel,
                               dim3(gridSize),
-                              dim3(BlockSize),
+                              dim3(ThreadPerBlock),
                               0,
                               arg.p_a_,
                               arg.p_b_,
                               arg.p_c_,
-                              arg.a_grid_desc_m_n_,
-                              arg.b_grid_desc_m_n_,
-                              arg.c_grid_desc_m_n_,
+                              arg.a_grid_desc_m0_,
+                              arg.b_grid_desc_m0_,
+                              arg.c_grid_desc_m0_,
                               arg.functor_);
             }
             else
@@ -128,14 +115,14 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
                 avgTime = launch_and_time_kernel(kernel,
                                                  nrepeat,
                                                  dim3(gridSize),
-                                                 dim3(BlockSize),
+                                                 dim3(ThreadPerBlock),
                                                  0,
                                                  arg.p_a_,
                                                  arg.p_b_,
                                                  arg.p_c_,
-                                                 arg.a_grid_desc_m_n_,
-                                                 arg.b_grid_desc_m_n_,
-                                                 arg.c_grid_desc_m_n_,
+                                                 arg.a_grid_desc_m0_,
+                                                 arg.b_grid_desc_m0_,
+                                                 arg.c_grid_desc_m0_,
                                                  arg.functor_);
             }
             return avgTime;
@@ -154,10 +141,10 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
         if(pArg == nullptr)
             return false;
 
-        const auto M = pArg->c_grid_desc_m_n_.GetLength(I0);
-        const auto N = pArg->c_grid_desc_m_n_.GetLength(I1);
+        // m * n
+        const auto m0 = pArg->c_grid_desc_m0_.GetLength(I0);
 
-        if(M % M_BlockTileSize != 0 && N % N_BlockTileSize != 0)
+        if(m0 % BlockTileSize != 0)
             return false;
 
         return true;
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
new file mode 100644
index 00000000000..cdd86e0e950
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
@@ -0,0 +1,149 @@
+#pragma once
+
+#include "cluster_descriptor.hpp"
+#include "data_type.hpp"
+#include "element_wise_operation.hpp"
+#include "threadwise_tensor_slice_transfer.hpp"
+
+namespace ck {
+
+template <typename GridwiseEltwise,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename GridDesc_M0,
+          typename ElementwiseFunctor>
+__global__ void kernel_elementwise_1d(const ADataType* __restrict__ p_a_global,
+                                      const BDataType* __restrict__ p_b_global,
+                                      CDataType* __restrict__ p_c_global,
+                                      const GridDesc_M0 a_grid_desc_m0,
+                                      const GridDesc_M0 b_grid_desc_m0,
+                                      const GridDesc_M0 c_grid_desc_m0,
+                                      const ElementwiseFunctor functor)
+{
+    GridwiseEltwise::Run(p_a_global,
+                         p_b_global,
+                         p_c_global,
+                         a_grid_desc_m0,
+                         b_grid_desc_m0,
+                         c_grid_desc_m0,
+                         functor);
+}
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename GridDesc_M0,
+          typename ElementwiseFunctor,
+          index_t ThreadPerBlock,
+          index_t ThreadTileSize,
+          index_t ScalarPerVector>
+struct GridwiseElementwise_1D
+{
+    static constexpr auto I0           = Number<0>{};
+    static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize;
+    static constexpr auto thread_desc_M0 =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));
+
+    using PassThrough = tensor_operation::element_wise::PassThrough;
+
+    static __device__ __host__ auto CalculateElementwiseIndex()
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+        const index_t block_id  = get_block_1d_id();
+
+        return make_multi_index(block_id * BlockTileSize + thread_id * ScalarPerVector);
+    }
+
+    __device__ static void Run(const ADataType* __restrict__ p_a_global,
+                               const BDataType* __restrict__ p_b_global,
+                               CDataType* __restrict__ p_c_global,
+                               const GridDesc_M0 a_grid_desc_m0,
+                               const GridDesc_M0 b_grid_desc_m0,
+                               const GridDesc_M0 c_grid_desc_m0,
+                               const ElementwiseFunctor functor)
+    {
+        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_global, a_grid_desc_m0.GetElementSpaceSize());
+        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_global, b_grid_desc_m0.GetElementSpaceSize());
+        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_global, c_grid_desc_m0.GetElementSpaceSize());
+
+        StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, CDataType, ScalarPerVector, true> c_thread_buf;
+
+        const auto thread_to_global_offset = CalculateElementwiseIndex();
+
+        auto a_global_load =
+            ThreadwiseTensorSliceTransfer_v2<ADataType,
+                                             ADataType,
+                                             GridDesc_M0,
+                                             decltype(thread_desc_M0),
+                                             Sequence<ScalarPerVector>, // SliceLengths
+                                             Sequence<0>,               // DimAccessOrder
+                                             0,                         // SrcVectorDim
+                                             ScalarPerVector,
+                                             1, // SrcScalarStrideInVector
+                                             false>{a_grid_desc_m0, thread_to_global_offset};
+
+        auto b_global_load =
+            ThreadwiseTensorSliceTransfer_v2<BDataType,
+                                             BDataType,
+                                             GridDesc_M0,
+                                             decltype(thread_desc_M0),
+                                             Sequence<ScalarPerVector>, // SliceLengths
+                                             Sequence<0>,               // DimAccessOrder
+                                             0,                         // SrcVectorDim
+                                             ScalarPerVector,
+                                             1, // SrcScalarStrideInVector
+                                             false>{b_grid_desc_m0, thread_to_global_offset};
+
+        auto c_global_write =
+            ThreadwiseTensorSliceTransfer_v1r3<CDataType,
+                                               CDataType,
+                                               decltype(thread_desc_M0),
+                                               GridDesc_M0,
+                                               PassThrough,
+                                               Sequence<ScalarPerVector>, // SliceLengths
+                                               Sequence<0>,               // DimAccessOrder
+                                               0,                         // DstVectorDim
+                                               ScalarPerVector,
+                                               InMemoryDataOperationEnum::Set,
+                                               1, // DstScalarStrideInVector
+                                               false>{
+                c_grid_desc_m0, thread_to_global_offset, PassThrough{}};
+
+        int num_iter                         = ThreadTileSize / ScalarPerVector;
+        constexpr auto thread_to_global_step = make_multi_index(ThreadPerBlock * ScalarPerVector);
+        do
+        {
+            // read and process ScalarPerVector elements
+            a_global_load.Run(
+                a_grid_desc_m0, a_global_buf, thread_desc_M0, make_tuple(I0), a_thread_buf);
+
+            b_global_load.Run(
+                b_grid_desc_m0, b_global_buf, thread_desc_M0, make_tuple(I0), b_thread_buf);
+
+            static_for<0, ScalarPerVector, 1>{}([&](auto m) {
+                constexpr auto offset = thread_desc_M0.CalculateOffset(make_tuple(m));
+                functor(c_thread_buf(Number<offset>{}),
+                        a_thread_buf(Number<offset>{}),
+                        b_thread_buf(Number<offset>{}));
+            });
+
+            c_global_write.Run(thread_desc_M0,
+                               make_tuple(I0), // SrcSliceOriginIdx
+                               c_thread_buf,
+                               c_grid_desc_m0,
+                               c_global_buf);
+
+            a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, thread_to_global_step);
+            b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, thread_to_global_step);
+            c_global_write.MoveDstSliceWindow(c_grid_desc_m0, thread_to_global_step);
+        } while(--num_iter);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
deleted file mode 100644
index 713411c63a4..00000000000
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+++ /dev/null
@@ -1,166 +0,0 @@
-#pragma once
-
-#include "cluster_descriptor.hpp"
-#include "data_type.hpp"
-#include "element_wise_operation.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-
-namespace ck {
-
-template <typename GridwiseEltwise,
-          typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename GridDesc_M_N,
-          typename ElementwiseFunctor>
-__global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global,
-                                      const BDataType* __restrict__ p_b_global,
-                                      CDataType* __restrict__ p_c_global,
-                                      const GridDesc_M_N a_grid_desc_m_k,
-                                      const GridDesc_M_N b_grid_desc_m_k,
-                                      const GridDesc_M_N c_grid_desc_m_k,
-                                      const ElementwiseFunctor functor)
-{
-    GridwiseEltwise::Run(p_a_global,
-                         p_b_global,
-                         p_c_global,
-                         a_grid_desc_m_k,
-                         b_grid_desc_m_k,
-                         c_grid_desc_m_k,
-                         functor);
-}
-
-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename GridDesc_M_N,
-          typename ElementwiseFunctor,
-          index_t MThreadPerBlock,
-          index_t NThreadPerBlock,
-          index_t MThreadTileSize,
-          index_t NThreadTileSize,
-          index_t AThreadTransferSrcVectorDim,
-          index_t AThreadTransferSrcScalarPerVector,
-          index_t BThreadTransferSrcVectorDim,
-          index_t BThreadTransferSrcScalarPerVector,
-          index_t CThreadTransferSrcScalarPerVector>
-struct GridwiseElementwise_2D
-{
-    static constexpr auto thread_buf_desc_M_N = make_naive_tensor_descriptor_packed(
-        make_tuple(Number<MThreadTileSize>{}, Number<NThreadTileSize>{}));
-
-    using PassThrough       = tensor_operation::element_wise::PassThrough;
-    using ThreadBufDesc_M_N = decltype(thread_buf_desc_M_N);
-
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-
-    static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize;
-    static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize;
-
-    static __device__ __host__ auto CalculateElementwiseIndex(const GridDesc_M_N& grid_desc_m_n)
-    {
-        const index_t thread_id = get_thread_local_1d_id();
-        const index_t block_id  = get_block_1d_id();
-
-        const index_t M              = grid_desc_m_n.GetLength(I0);
-        const index_t gridSize_m     = M / M_BlockTileSize;
-        const index_t block_2d_idx_m = block_id % gridSize_m;
-        const index_t block_2d_idx_n = block_id / gridSize_m;
-
-        constexpr auto thread_desc =
-            make_cluster_descriptor(Sequence<MThreadPerBlock, NThreadPerBlock>{}, Sequence<1, 0>{});
-
-        const auto thread_2d_idx = thread_desc.CalculateBottomIndex(make_multi_index(thread_id));
-
-        return make_multi_index(
-            block_2d_idx_m * M_BlockTileSize + thread_2d_idx[I0] * MThreadTileSize,
-            block_2d_idx_n * N_BlockTileSize + thread_2d_idx[I1] * NThreadTileSize);
-    }
-
-    __device__ static void Run(const ADataType* __restrict__ p_a_global,
-                               const BDataType* __restrict__ p_b_global,
-                               CDataType* __restrict__ p_c_global,
-                               const GridDesc_M_N a_grid_desc_m_n,
-                               const GridDesc_M_N b_grid_desc_m_n,
-                               const GridDesc_M_N c_grid_desc_m_n,
-                               const ElementwiseFunctor functor)
-    {
-        const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_global, a_grid_desc_m_n.GetElementSpaceSize());
-        const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_global, b_grid_desc_m_n.GetElementSpaceSize());
-        auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_c_global, c_grid_desc_m_n.GetElementSpaceSize());
-
-        StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, MThreadTileSize * NThreadTileSize, true>
-            a_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, MThreadTileSize * NThreadTileSize, true>
-            b_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, CDataType, MThreadTileSize * NThreadTileSize, true>
-            c_thread_buf;
-
-        const auto a_global_load_offset = CalculateElementwiseIndex(a_grid_desc_m_n);
-        const auto b_global_load_offset = CalculateElementwiseIndex(b_grid_desc_m_n);
-
-        auto a_global_load = ThreadwiseTensorSliceTransfer_v2<
-            ADataType,
-            ADataType,
-            GridDesc_M_N,
-            decltype(thread_buf_desc_M_N),
-            Sequence<MThreadTileSize, NThreadTileSize>, // SliceLengths
-            Sequence<0, 1>,                             // DimAccessOrder
-            AThreadTransferSrcVectorDim,
-            AThreadTransferSrcScalarPerVector,
-            1, // SrcScalarStrideInVector
-            false>{a_grid_desc_m_n, a_global_load_offset};
-
-        auto b_global_load = ThreadwiseTensorSliceTransfer_v2<
-            BDataType,
-            BDataType,
-            GridDesc_M_N,
-            decltype(thread_buf_desc_M_N),
-            Sequence<MThreadTileSize, NThreadTileSize>, // SliceLengths
-            Sequence<0, 1>,                             // DimAccessOrder
-            BThreadTransferSrcVectorDim,
-            BThreadTransferSrcScalarPerVector,
-            1, // SrcScalarStrideInVector
-            false>{b_grid_desc_m_n, b_global_load_offset};
-
-        a_global_load.Run(
-            a_grid_desc_m_n, a_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), a_thread_buf);
-
-        b_global_load.Run(
-            b_grid_desc_m_n, b_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), b_thread_buf);
-
-        static_for<0, MThreadTileSize, 1>{}([&](auto m) {
-            static_for<0, NThreadTileSize, 1>{}([&](auto n) {
-                constexpr auto offset = thread_buf_desc_M_N.CalculateOffset(make_tuple(m, n));
-                functor(c_thread_buf(Number<offset>{}),
-                        a_thread_buf(Number<offset>{}),
-                        b_thread_buf(Number<offset>{}));
-            });
-        });
-
-        // TODO - global write
-        const auto c_global_write_offset = CalculateElementwiseIndex(c_grid_desc_m_n);
-        auto c_global_write              = ThreadwiseTensorSliceTransfer_v1r3<
-            CDataType,
-            CDataType,
-            decltype(thread_buf_desc_M_N),
-            GridDesc_M_N,
-            PassThrough,
-            Sequence<MThreadTileSize, NThreadTileSize>, // SliceLengths
-            Sequence<0, 1>,                             // DimAccessOrder
-            1,                                          // DstVectorDim
-            CThreadTransferSrcScalarPerVector,          // DstScalarPerVector
-            InMemoryDataOperationEnum::Set,             // DstInMemOp
-            1,                                          // DstScalarStrideInVector
-            false>{c_grid_desc_m_n, c_global_write_offset, PassThrough{}};
-
-        c_global_write.Run(
-            thread_buf_desc_M_N, make_tuple(I0, I0), c_thread_buf, c_grid_desc_m_n, c_global_buf);
-    }
-};
-
-} // namespace ck

From fe6595020b0ce31f7987956119e2427079d1b336 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Fri, 15 Apr 2022 12:29:04 +0000
Subject: [PATCH 13/28] Add verication of softmax

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 122 ++++++++++++++++--
 1 file changed, 110 insertions(+), 12 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 9f2cbe1e625..922c914f928 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -5,11 +5,14 @@
 #include <stdlib.h>
 #include <half.hpp>
 #include <math.h>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "host_reduce_util.hpp"
+#include "host_reduction.hpp"
+
 #include "device_tensor.hpp"
 #include "device_gemm_xdl.hpp"
 #include "device_gemm_xdl_c_shuffle.hpp"
@@ -89,9 +92,7 @@ constexpr int Rank                       = 2;
 constexpr int NumReduceDim               = 1;
 constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX;
 constexpr ck::ReduceTensorOp ReduceSumId = ck::ReduceTensorOp::ADD;
-constexpr ck::NanPropagation NanOpt      = ck::NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
-// constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES;
+constexpr bool ReducePropagateNan        = false;
 using ReduceMaxOp = typename ck::reduce_binary_operator<CDataType, ReduceMaxId>::opType;
 using ReduceSumOp = typename ck::reduce_binary_operator<CDataType, ReduceSumId>::opType;
 using ReduceMaxInElementwiseOperation =
@@ -112,7 +113,7 @@ using DeviceReduceMaxInstance =
                                                         ReduceMaxOp,
                                                         ReduceMaxInElementwiseOperation,
                                                         ReduceMaxAccElementwiseOperation,
-                                                        PropagateNan,
+                                                        ReducePropagateNan,
                                                         false,
                                                         256,
                                                         4,
@@ -132,7 +133,7 @@ using DeviceReduceSumInstance =
                                                         ReduceSumOp,
                                                         ReduceSumInElementwiseOperation,
                                                         ReduceSumAccElementwiseOperation,
-                                                        PropagateNan,
+                                                        ReducePropagateNan,
                                                         false,
                                                         256,
                                                         4,
@@ -170,9 +171,47 @@ using DeviceElementwiseSubExpInstance = ck::tensor_operation::device::
 using DeviceElementwiseDivInstance = ck::tensor_operation::device::
     DeviceElementwise_2D<CDataType, CDataType, CDataType, Div, 256, 32, 8>;
 
-using ReferenceGemmInstance = ck::tensor_operation::host::
+using HostGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
 
+using HostReduceMaxInstance = ReductionHost<CDataType,
+                                            CDataType,
+                                            CDataType,
+                                            ReduceMaxId,
+                                            Rank,
+                                            NumReduceDim,
+                                            ReducePropagateNan,
+                                            false>;
+
+using HostReduceSumInstance = ReductionHost<CDataType,
+                                            CDataType,
+                                            CDataType,
+                                            ReduceSumId,
+                                            Rank,
+                                            NumReduceDim,
+                                            ReducePropagateNan,
+                                            false>;
+
+template <typename HostTensorA,
+          typename HostTensorB,
+          typename HostTensorC,
+          typename Functor,
+          int broadcastDim>
+void host_broadcast2D(
+    HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, int N, Functor functor)
+{
+    for(int m = 0; m < M; ++m)
+    {
+        for(int n = 0; n < N; ++n)
+        {
+            if constexpr(broadcastDim == 1)
+                functor(C(m, n), A(m, n), B(n));
+            else
+                functor(C(m, n), A(m, n), B(m));
+        }
+    }
+}
+
 int main(int argc, char* argv[])
 {
     bool do_verification = 0;
@@ -189,7 +228,6 @@ int main(int argc, char* argv[])
     ck::index_t StrideC = 4096;
 
     const std::vector<int> reduceDims{0};
-    const std::vector<int> reduceInvariantDims{1};
 
     if(argc == 4)
     {
@@ -237,8 +275,8 @@ int main(int argc, char* argv[])
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<int> c_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
-                        std::vector<std::size_t>({1}));
+    Tensor<CDataType> c_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
+                              std::vector<std::size_t>({1}));
     Tensor<CDataType> exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> exp_n_sum(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
                                 std::vector<std::size_t>({1}));
@@ -370,8 +408,8 @@ int main(int argc, char* argv[])
         reduce_n_shape,
         reduce_n_stride,
         reduceDims,
-        1,
-        0,
+        1, // alpha
+        0, // beta
         exp_m_n_device_buf.GetDeviceBuffer(),
         exp_n_sum_device_buf.GetDeviceBuffer(),
         indices_device_buf.GetDeviceBuffer(),
@@ -410,6 +448,66 @@ int main(int argc, char* argv[])
     broadcastDiv_invoker_ptr->Run(broadcastDiv_argument_ptr.get(), nrepeat);
 
     // TODO = do_verification
-    (void)do_verification;
+    if(do_verification)
+    {
+        std::cout << "verification..." << std::endl;
+        const std::vector<int> reduceInvariantDims{1};
+        Tensor<CDataType> host_c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+        Tensor<CDataType> host_c_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
+                                       std::vector<std::size_t>({1}));
+        Tensor<int> host_indices(host_c_n_max.mDesc.GetLengths());
+        Tensor<CDataType> host_exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+        Tensor<CDataType> host_exp_n_sum(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
+                                         std::vector<std::size_t>({1}));
+        Tensor<CDataType> host_softmax_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+        auto host_gemm          = HostGemmInstance{};
+        auto host_gemm_invoker  = host_gemm.MakeInvoker();
+        auto host_gemm_argument = host_gemm.MakeArgument(
+            a_m_k, b_k_n, host_c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        auto host_reduce_max = HostReduceMaxInstance{
+            host_c_m_n.mDesc, host_c_n_max.mDesc, reduceInvariantDims, reduceDims};
+
+        auto host_reduce_sum = HostReduceSumInstance{
+            host_exp_m_n.mDesc, host_exp_n_sum.mDesc, reduceInvariantDims, reduceDims};
+
+        host_gemm_invoker.Run(host_gemm_argument);
+        host_reduce_max.Run(1, // alpha
+                            reinterpret_cast<const CDataType*>(host_c_m_n.mData.data()),
+                            0, // beta
+                            reinterpret_cast<CDataType*>(host_c_n_max.mData.data()),
+                            host_indices.mData.data());
+
+        host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Sub_Exp, 1>(
+            host_exp_m_n, host_c_m_n, host_c_n_max, M, N, Sub_Exp{});
+
+        host_reduce_sum.Run(1, // alpha
+                            reinterpret_cast<const CDataType*>(host_exp_m_n.mData.data()),
+                            0, // beta
+                            reinterpret_cast<CDataType*>(host_exp_n_sum.mData.data()),
+                            host_indices.mData.data());
+
+        host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Div, 1>(
+            host_softmax_m_n, host_exp_m_n, host_exp_n_sum, M, N, Div{});
+
+        c_m_n_device_buf.FromDevice(c_m_n.mData.data());
+        c_n_max_device_buf.FromDevice(c_n_max.mData.data());
+        exp_m_n_device_buf.FromDevice(exp_m_n.mData.data());
+        exp_n_sum_device_buf.FromDevice(exp_n_sum.mData.data());
+        softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data());
+
+        bool result = true;
+        if (result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))
+            std::cout << "[PASS] - c_m_n" << std::endl;
+        if (result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData))
+            std::cout << "[PASS] - c_n_max" << std::endl;
+        if (result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData))
+            std::cout << "[PASS] - exp_m_n" << std::endl;
+        if (result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData))
+            std::cout << "[PASS] - exp_n_sum" << std::endl;
+        if (result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData))
+            std::cout << "[PASS] - softmax_m_n" << std::endl;
+    }
     return 0;
 }

From e83b22e052e386f553ce2ddbedd66deb0f7ee167 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Mon, 18 Apr 2022 10:44:29 +0000
Subject: [PATCH 14/28] [What] Use half_float::half instead of ck::half_t for
 host reduction [Why]  std::numeric_limits<_Float16>::lowest() will return
 zero

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 922c914f928..3b164b578c9 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -41,6 +41,10 @@ using BDataType   = F16;
 using CDataType   = F16;
 using AccDataType = F32;
 
+// CAUSION - host reduce_max will call numeric_limits<ck::half_t>::lowest()
+// However, numeric_limits<ck::half_t>::lowest() will return zero. So, used half_float::half instead
+using HostReduceDataType = half_float::half;
+
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
 using CLayout = ck::tensor_layout::gemm::RowMajor;
@@ -174,18 +178,18 @@ using DeviceElementwiseDivInstance = ck::tensor_operation::device::
 using HostGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
 
-using HostReduceMaxInstance = ReductionHost<CDataType,
-                                            CDataType,
-                                            CDataType,
+using HostReduceMaxInstance = ReductionHost<HostReduceDataType,
+                                            HostReduceDataType,
+                                            HostReduceDataType,
                                             ReduceMaxId,
                                             Rank,
                                             NumReduceDim,
                                             ReducePropagateNan,
                                             false>;
 
-using HostReduceSumInstance = ReductionHost<CDataType,
-                                            CDataType,
-                                            CDataType,
+using HostReduceSumInstance = ReductionHost<HostReduceDataType,
+                                            HostReduceDataType,
+                                            HostReduceDataType,
                                             ReduceSumId,
                                             Rank,
                                             NumReduceDim,
@@ -474,18 +478,18 @@ int main(int argc, char* argv[])
 
         host_gemm_invoker.Run(host_gemm_argument);
         host_reduce_max.Run(1, // alpha
-                            reinterpret_cast<const CDataType*>(host_c_m_n.mData.data()),
+                            reinterpret_cast<const HostReduceDataType*>(host_c_m_n.mData.data()),
                             0, // beta
-                            reinterpret_cast<CDataType*>(host_c_n_max.mData.data()),
+                            reinterpret_cast<HostReduceDataType*>(host_c_n_max.mData.data()),
                             host_indices.mData.data());
 
         host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Sub_Exp, 1>(
             host_exp_m_n, host_c_m_n, host_c_n_max, M, N, Sub_Exp{});
 
         host_reduce_sum.Run(1, // alpha
-                            reinterpret_cast<const CDataType*>(host_exp_m_n.mData.data()),
+                            reinterpret_cast<const HostReduceDataType*>(host_exp_m_n.mData.data()),
                             0, // beta
-                            reinterpret_cast<CDataType*>(host_exp_n_sum.mData.data()),
+                            reinterpret_cast<HostReduceDataType*>(host_exp_n_sum.mData.data()),
                             host_indices.mData.data());
 
         host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Div, 1>(
@@ -498,15 +502,15 @@ int main(int argc, char* argv[])
         softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data());
 
         bool result = true;
-        if (result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))
+        if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))
             std::cout << "[PASS] - c_m_n" << std::endl;
-        if (result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData))
+        if(result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData))
             std::cout << "[PASS] - c_n_max" << std::endl;
-        if (result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData))
+        if(result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData))
             std::cout << "[PASS] - exp_m_n" << std::endl;
-        if (result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData))
+        if(result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData))
             std::cout << "[PASS] - exp_n_sum" << std::endl;
-        if (result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData))
+        if(result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData))
             std::cout << "[PASS] - softmax_m_n" << std::endl;
     }
     return 0;

From 21802fda18e7e659418642c5befd163932968896 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Mon, 18 Apr 2022 11:09:17 +0000
Subject: [PATCH 15/28] [What] Sync input of each host kernel and device kernel
 [Why] Prevent error propogation

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 3b164b578c9..2326168e1e7 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -455,6 +455,13 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
         std::cout << "verification..." << std::endl;
+
+        c_m_n_device_buf.FromDevice(c_m_n.mData.data());
+        c_n_max_device_buf.FromDevice(c_n_max.mData.data());
+        exp_m_n_device_buf.FromDevice(exp_m_n.mData.data());
+        exp_n_sum_device_buf.FromDevice(exp_n_sum.mData.data());
+        softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data());
+
         const std::vector<int> reduceInvariantDims{1};
         Tensor<CDataType> host_c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
         Tensor<CDataType> host_c_n_max(std::vector<std::size_t>({static_cast<std::size_t>(N)}),
@@ -478,28 +485,22 @@ int main(int argc, char* argv[])
 
         host_gemm_invoker.Run(host_gemm_argument);
         host_reduce_max.Run(1, // alpha
-                            reinterpret_cast<const HostReduceDataType*>(host_c_m_n.mData.data()),
+                            reinterpret_cast<const HostReduceDataType*>(c_m_n.mData.data()),
                             0, // beta
                             reinterpret_cast<HostReduceDataType*>(host_c_n_max.mData.data()),
                             host_indices.mData.data());
 
         host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Sub_Exp, 1>(
-            host_exp_m_n, host_c_m_n, host_c_n_max, M, N, Sub_Exp{});
+            host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{});
 
         host_reduce_sum.Run(1, // alpha
-                            reinterpret_cast<const HostReduceDataType*>(host_exp_m_n.mData.data()),
+                            reinterpret_cast<const HostReduceDataType*>(exp_m_n.mData.data()),
                             0, // beta
                             reinterpret_cast<HostReduceDataType*>(host_exp_n_sum.mData.data()),
                             host_indices.mData.data());
 
         host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Div, 1>(
-            host_softmax_m_n, host_exp_m_n, host_exp_n_sum, M, N, Div{});
-
-        c_m_n_device_buf.FromDevice(c_m_n.mData.data());
-        c_n_max_device_buf.FromDevice(c_n_max.mData.data());
-        exp_m_n_device_buf.FromDevice(exp_m_n.mData.data());
-        exp_n_sum_device_buf.FromDevice(exp_n_sum.mData.data());
-        softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data());
+            host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{});
 
         bool result = true;
         if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))

From cf3266902d26e828d47b367838d45790a9014b22 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 20 Apr 2022 10:32:39 +0000
Subject: [PATCH 16/28] [What] Use F32 as the acc of reduce sum [Why] Prevent
 loss of precision

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 2326168e1e7..73556331b56 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -98,15 +98,15 @@ constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX;
 constexpr ck::ReduceTensorOp ReduceSumId = ck::ReduceTensorOp::ADD;
 constexpr bool ReducePropagateNan        = false;
 using ReduceMaxOp = typename ck::reduce_binary_operator<CDataType, ReduceMaxId>::opType;
-using ReduceSumOp = typename ck::reduce_binary_operator<CDataType, ReduceSumId>::opType;
+using ReduceSumOp = typename ck::reduce_binary_operator<AccDataType, ReduceSumId>::opType;
 using ReduceMaxInElementwiseOperation =
     typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::InElementwiseOperation;
 using ReduceMaxAccElementwiseOperation =
     typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::AccElementwiseOperation;
 using ReduceSumInElementwiseOperation =
-    typename ck::reduce_unary_operator<CDataType, ReduceSumId, true, true>::InElementwiseOperation;
+    typename ck::reduce_unary_operator<AccDataType, ReduceSumId, true, true>::InElementwiseOperation;
 using ReduceSumAccElementwiseOperation =
-    typename ck::reduce_unary_operator<CDataType, ReduceSumId, true, true>::AccElementwiseOperation;
+    typename ck::reduce_unary_operator<AccDataType, ReduceSumId, true, true>::AccElementwiseOperation;
 
 using DeviceReduceMaxInstance =
     ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
@@ -130,7 +130,7 @@ using DeviceReduceMaxInstance =
 
 using DeviceReduceSumInstance =
     ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
-                                                        CDataType,
+                                                        AccDataType,
                                                         CDataType,
                                                         Rank,
                                                         NumReduceDim,
@@ -188,7 +188,7 @@ using HostReduceMaxInstance = ReductionHost<HostReduceDataType,
                                             false>;
 
 using HostReduceSumInstance = ReductionHost<HostReduceDataType,
-                                            HostReduceDataType,
+                                            AccDataType,
                                             HostReduceDataType,
                                             ReduceSumId,
                                             Rank,
@@ -504,15 +504,15 @@ int main(int argc, char* argv[])
 
         bool result = true;
         if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))
-            std::cout << "[PASS] - c_m_n" << std::endl;
+            std::cout << "[PASS] - gemm" << std::endl;
         if(result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData))
-            std::cout << "[PASS] - c_n_max" << std::endl;
+            std::cout << "[PASS] - reduce max" << std::endl;
         if(result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData))
-            std::cout << "[PASS] - exp_m_n" << std::endl;
+            std::cout << "[PASS] - broadcast sub + exp" << std::endl;
         if(result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData))
-            std::cout << "[PASS] - exp_n_sum" << std::endl;
+            std::cout << "[PASS] - reduce sum" << std::endl;
         if(result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData))
-            std::cout << "[PASS] - softmax_m_n" << std::endl;
+            std::cout << "[PASS] - broadcast div" << std::endl;
     }
     return 0;
 }

From 0f421d6fe75c4162e26ff83a3a78c32c8f281bc4 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 20 Apr 2022 10:57:41 +0000
Subject: [PATCH 17/28] [What] Add ComputeDataType to the eltwise kernel [Why]
 Similar to acc datatype, it increase precision

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 75 ++++++++++++-------
 .../gpu/device/device_elementwise_2d.hpp      |  2 +
 .../gpu/grid/gridwise_elementwise_1d.hpp      | 13 ++--
 3 files changed, 59 insertions(+), 31 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 73556331b56..0accded132e 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -36,10 +36,11 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-using ADataType   = F16;
-using BDataType   = F16;
-using CDataType   = F16;
-using AccDataType = F32;
+using ADataType              = F16;
+using BDataType              = F16;
+using CDataType              = F16;
+using AccDataType            = F32;
+using EltwiseComputeDataType = F32;
 
 // CAUSION - host reduce_max will call numeric_limits<ck::half_t>::lowest()
 // However, numeric_limits<ck::half_t>::lowest() will return zero. So, used half_float::half instead
@@ -103,10 +104,10 @@ using ReduceMaxInElementwiseOperation =
     typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::InElementwiseOperation;
 using ReduceMaxAccElementwiseOperation =
     typename ck::reduce_unary_operator<CDataType, ReduceMaxId, true, true>::AccElementwiseOperation;
-using ReduceSumInElementwiseOperation =
-    typename ck::reduce_unary_operator<AccDataType, ReduceSumId, true, true>::InElementwiseOperation;
-using ReduceSumAccElementwiseOperation =
-    typename ck::reduce_unary_operator<AccDataType, ReduceSumId, true, true>::AccElementwiseOperation;
+using ReduceSumInElementwiseOperation = typename ck::
+    reduce_unary_operator<AccDataType, ReduceSumId, true, true>::InElementwiseOperation;
+using ReduceSumAccElementwiseOperation = typename ck::
+    reduce_unary_operator<AccDataType, ReduceSumId, true, true>::AccElementwiseOperation;
 
 using DeviceReduceMaxInstance =
     ck::tensor_operation::device::DeviceReduceBlockWise<CDataType,
@@ -150,30 +151,36 @@ using DeviceReduceSumInstance =
 
 struct Sub_Exp
 {
-    __host__ __device__ constexpr void
-    operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
+    __host__ __device__ constexpr void operator()(EltwiseComputeDataType& dst,
+                                                  const EltwiseComputeDataType& src1,
+                                                  const EltwiseComputeDataType& src2) const
     {
-        dst = src1 - src2;
-        // FIXME - use float16 exponential
-        float dst_f32 = static_cast<float>(dst);
-        dst           = static_cast<CDataType>(exp(dst_f32));
+        dst = exp(src1 - src2);
     }
 };
 
 struct Div
 {
-    __host__ __device__ constexpr void
-    operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const
+    __host__ __device__ constexpr void operator()(EltwiseComputeDataType& dst,
+                                                  const EltwiseComputeDataType& src1,
+                                                  const EltwiseComputeDataType& src2) const
     {
         dst = src1 / src2;
     }
 };
 
-using DeviceElementwiseSubExpInstance = ck::tensor_operation::device::
-    DeviceElementwise_2D<CDataType, CDataType, CDataType, Sub_Exp, 256, 32, 8>;
+using DeviceElementwiseSubExpInstance =
+    ck::tensor_operation::device::DeviceElementwise_2D<CDataType,
+                                                       CDataType,
+                                                       CDataType,
+                                                       EltwiseComputeDataType,
+                                                       Sub_Exp,
+                                                       256,
+                                                       32,
+                                                       8>;
 
 using DeviceElementwiseDivInstance = ck::tensor_operation::device::
-    DeviceElementwise_2D<CDataType, CDataType, CDataType, Div, 256, 32, 8>;
+    DeviceElementwise_2D<CDataType, CDataType, CDataType, EltwiseComputeDataType, Div, 256, 32, 8>;
 
 using HostGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
@@ -199,6 +206,7 @@ using HostReduceSumInstance = ReductionHost<HostReduceDataType,
 template <typename HostTensorA,
           typename HostTensorB,
           typename HostTensorC,
+          typename ComputeDataType,
           typename Functor,
           int broadcastDim>
 void host_broadcast2D(
@@ -208,10 +216,19 @@ void host_broadcast2D(
     {
         for(int n = 0; n < N; ++n)
         {
+            ComputeDataType Amn = static_cast<ComputeDataType>(A(m, n));
+            ComputeDataType Cmn = 0;
             if constexpr(broadcastDim == 1)
-                functor(C(m, n), A(m, n), B(n));
+            {
+                ComputeDataType Bn = static_cast<ComputeDataType>(B(n));
+                functor(Cmn, Amn, Bn);
+            }
             else
-                functor(C(m, n), A(m, n), B(m));
+            {
+                ComputeDataType Bm = static_cast<ComputeDataType>(B(m));
+                functor(Cmn, Amn, Bm);
+            }
+            C(m, n) = static_cast<ComputeDataType>(Cmn);
         }
     }
 }
@@ -490,8 +507,12 @@ int main(int argc, char* argv[])
                             reinterpret_cast<HostReduceDataType*>(host_c_n_max.mData.data()),
                             host_indices.mData.data());
 
-        host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Sub_Exp, 1>(
-            host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{});
+        host_broadcast2D<Tensor<CDataType>,
+                         Tensor<CDataType>,
+                         Tensor<CDataType>,
+                         EltwiseComputeDataType,
+                         Sub_Exp,
+                         1>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{});
 
         host_reduce_sum.Run(1, // alpha
                             reinterpret_cast<const HostReduceDataType*>(exp_m_n.mData.data()),
@@ -499,8 +520,12 @@ int main(int argc, char* argv[])
                             reinterpret_cast<HostReduceDataType*>(host_exp_n_sum.mData.data()),
                             host_indices.mData.data());
 
-        host_broadcast2D<Tensor<CDataType>, Tensor<CDataType>, Tensor<CDataType>, Div, 1>(
-            host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{});
+        host_broadcast2D<Tensor<CDataType>,
+                         Tensor<CDataType>,
+                         Tensor<CDataType>,
+                         EltwiseComputeDataType,
+                         Div,
+                         1>(host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{});
 
         bool result = true;
         if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
index 182929f63cd..5d4054dad2b 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -13,6 +13,7 @@ namespace device {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
+          typename ComputeDataType,
           typename ElementwiseFunctor,
           index_t ThreadPerBlock,
           index_t ThreadTileSize,
@@ -43,6 +44,7 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
     using GridwiseEltwise = GridwiseElementwise_1D<ADataType,
                                                    BDataType,
                                                    CDataType,
+                                                   ComputeDataType,
                                                    GridDesc_M0,
                                                    ElementwiseFunctor,
                                                    ThreadPerBlock,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
index cdd86e0e950..789345c1312 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
@@ -33,6 +33,7 @@ __global__ void kernel_elementwise_1d(const ADataType* __restrict__ p_a_global,
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
+          typename ComputeDataType,
           typename GridDesc_M0,
           typename ElementwiseFunctor,
           index_t ThreadPerBlock,
@@ -70,15 +71,15 @@ struct GridwiseElementwise_1D
         auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_c_global, c_grid_desc_m0.GetElementSpaceSize());
 
-        StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;
-        StaticBuffer<AddressSpaceEnum::Vgpr, CDataType, ScalarPerVector, true> c_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> a_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> b_thread_buf;
+        StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> c_thread_buf;
 
         const auto thread_to_global_offset = CalculateElementwiseIndex();
 
         auto a_global_load =
             ThreadwiseTensorSliceTransfer_v2<ADataType,
-                                             ADataType,
+                                             ComputeDataType,
                                              GridDesc_M0,
                                              decltype(thread_desc_M0),
                                              Sequence<ScalarPerVector>, // SliceLengths
@@ -90,7 +91,7 @@ struct GridwiseElementwise_1D
 
         auto b_global_load =
             ThreadwiseTensorSliceTransfer_v2<BDataType,
-                                             BDataType,
+                                             ComputeDataType,
                                              GridDesc_M0,
                                              decltype(thread_desc_M0),
                                              Sequence<ScalarPerVector>, // SliceLengths
@@ -101,7 +102,7 @@ struct GridwiseElementwise_1D
                                              false>{b_grid_desc_m0, thread_to_global_offset};
 
         auto c_global_write =
-            ThreadwiseTensorSliceTransfer_v1r3<CDataType,
+            ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
                                                CDataType,
                                                decltype(thread_desc_M0),
                                                GridDesc_M0,

From 5fa209af9fa6e764f0b0fe610f7ec875a1497650 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 20 Apr 2022 12:12:51 +0000
Subject: [PATCH 18/28] Add padding

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp |  1 -
 .../gpu/device/device_elementwise_2d.hpp      | 19 +++++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 0accded132e..aafa54cca82 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -468,7 +468,6 @@ int main(int argc, char* argv[])
     auto broadcastDiv_invoker_ptr = broadcastDiv.MakeInvokerPointer();
     broadcastDiv_invoker_ptr->Run(broadcastDiv_argument_ptr.get(), nrepeat);
 
-    // TODO = do_verification
     if(do_verification)
     {
         std::cout << "verification..." << std::endl;
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
index 5d4054dad2b..ccccb5d6a85 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
@@ -34,10 +34,21 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
             make_naive_tensor_descriptor(make_tuple(m, n), make_tuple(stride[0], stride[1]));
 
         // 1d desc - [m * n]
-        return transform_tensor_descriptor(desc_m_n,
-                                           make_tuple(make_merge_transform(make_tuple(m, n))),
-                                           make_tuple(Sequence<0, 1>{}),
-                                           make_tuple(Sequence<0>{}));
+        const auto desc_m0 =
+            transform_tensor_descriptor(desc_m_n,
+                                        make_tuple(make_merge_transform(make_tuple(m, n))),
+                                        make_tuple(Sequence<0, 1>{}),
+                                        make_tuple(Sequence<0>{}));
+
+        // pad
+        const auto m0  = desc_m0.GetLength(I0);
+        const auto pad = math::integer_least_multiple(m0, ScalarPerVector) - m0;
+        const auto desc_m0_pad =
+            transform_tensor_descriptor(desc_m0,
+                                        make_tuple(make_right_pad_transform(m0, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m0_pad;
     }
 
     using GridDesc_M0     = decltype(MakeDescriptor_M0({1, 1}, {1, 1}));

From 0e6bf342df9b985094b36f8b5169d3ac750c4b4a Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 20 Apr 2022 12:24:36 +0000
Subject: [PATCH 19/28] Rename elementwise p[ to binary elementwise

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 10 +++---
 ...wise.hpp => device_binary_elementwise.hpp} |  2 +-
 ...d.hpp => device_binary_elementwise_2d.hpp} | 33 ++++++++++---------
 ...hpp => gridwise_binary_elementwise_1d.hpp} | 18 +++++-----
 4 files changed, 33 insertions(+), 30 deletions(-)
 rename include/ck/tensor_operation/gpu/device/{device_elementwise.hpp => device_binary_elementwise.hpp} (93%)
 rename include/ck/tensor_operation/gpu/device/{device_elementwise_2d.hpp => device_binary_elementwise_2d.hpp} (86%)
 rename include/ck/tensor_operation/gpu/grid/{gridwise_elementwise_1d.hpp => gridwise_binary_elementwise_1d.hpp} (95%)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index aafa54cca82..08aefd87c52 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -23,7 +23,7 @@
 #include "device_reduce_blockwise.hpp"
 #include "reduction_enums.hpp"
 #include "reduction_operator_mapping.hpp"
-#include "device_elementwise_2d.hpp"
+#include "device_binary_elementwise_2d.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -170,7 +170,7 @@ struct Div
 };
 
 using DeviceElementwiseSubExpInstance =
-    ck::tensor_operation::device::DeviceElementwise_2D<CDataType,
+    ck::tensor_operation::device::DeviceBinaryElementwise_2D<CDataType,
                                                        CDataType,
                                                        CDataType,
                                                        EltwiseComputeDataType,
@@ -180,7 +180,7 @@ using DeviceElementwiseSubExpInstance =
                                                        8>;
 
 using DeviceElementwiseDivInstance = ck::tensor_operation::device::
-    DeviceElementwise_2D<CDataType, CDataType, CDataType, EltwiseComputeDataType, Div, 256, 32, 8>;
+    DeviceBinaryElementwise_2D<CDataType, CDataType, CDataType, EltwiseComputeDataType, Div, 256, 32, 8>;
 
 using HostGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
@@ -412,7 +412,7 @@ int main(int argc, char* argv[])
     if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get()))
     {
         throw std::runtime_error("The runtime parameters seems not supported by the "
-                                 "DeviceElementwise_2D instance, exiting!");
+                                 "DeviceBinaryElementwise_2D instance, exiting!");
     };
 
     auto broadcastSubExp_invoker_ptr = broadcastSubExp.MakeInvokerPointer();
@@ -462,7 +462,7 @@ int main(int argc, char* argv[])
     if(!broadcastDiv.IsSupportedArgument(broadcastDiv_argument_ptr.get()))
     {
         throw std::runtime_error("The runtime parameters seems not supported by the "
-                                 "DeviceElementwise_2D instance, exiting!");
+                                 "DeviceBinaryElementwise_2D instance, exiting!");
     };
 
     auto broadcastDiv_invoker_ptr = broadcastDiv.MakeInvokerPointer();
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
similarity index 93%
rename from include/ck/tensor_operation/gpu/device/device_elementwise.hpp
rename to include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
index e79e1112256..8809a1090ca 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
@@ -9,7 +9,7 @@ namespace tensor_operation {
 namespace device {
 
 template <typename ElementwiseFunctor>
-struct DeviceElementwise : public BaseOperator
+struct DeviceBinaryElementwise : public BaseOperator
 {
 
     virtual std::unique_ptr<BaseArgument>
diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
similarity index 86%
rename from include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
rename to include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
index ccccb5d6a85..fd67ed62746 100644
--- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
@@ -3,8 +3,8 @@
 #include <vector>
 
 #include "device.hpp"
-#include "device_elementwise.hpp"
-#include "gridwise_elementwise_1d.hpp"
+#include "device_binary_elementwise.hpp"
+#include "gridwise_binary_elementwise_1d.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -18,7 +18,7 @@ template <typename ADataType,
           index_t ThreadPerBlock,
           index_t ThreadTileSize,
           index_t ScalarPerVector>
-struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
+struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFunctor>
 {
     static_assert(ThreadTileSize % ScalarPerVector == 0);
     static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize;
@@ -51,16 +51,16 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
         return desc_m0_pad;
     }
 
-    using GridDesc_M0     = decltype(MakeDescriptor_M0({1, 1}, {1, 1}));
-    using GridwiseEltwise = GridwiseElementwise_1D<ADataType,
-                                                   BDataType,
-                                                   CDataType,
-                                                   ComputeDataType,
-                                                   GridDesc_M0,
-                                                   ElementwiseFunctor,
-                                                   ThreadPerBlock,
-                                                   ThreadTileSize,
-                                                   ScalarPerVector>;
+    using GridDesc_M0        = decltype(MakeDescriptor_M0({1, 1}, {1, 1}));
+    using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
+                                                            BDataType,
+                                                            CDataType,
+                                                            ComputeDataType,
+                                                            GridDesc_M0,
+                                                            ElementwiseFunctor,
+                                                            ThreadPerBlock,
+                                                            ThreadTileSize,
+                                                            ScalarPerVector>;
 
     struct Argument : public BaseArgument
     {
@@ -101,7 +101,7 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
 
         float Run(const Argument& arg, int nrepeat = 1)
         {
-            const auto kernel      = kernel_elementwise_1d<GridwiseEltwise,
+            const auto kernel      = kernel_elementwise_1d<GridwiseBinEltwise,
                                                       ADataType,
                                                       BDataType,
                                                       CDataType,
@@ -192,8 +192,11 @@ struct DeviceElementwise_2D : public DeviceElementwise<ElementwiseFunctor>
         auto str = std::stringstream();
 
         // clang-format off
-        str << "DeviceElementwise_2D"
+        str << "DeviceBinaryElementwise_2D"
             << "<"
+            << "ThreadPerBlock = " << ThreadPerBlock
+            << "ThreadTileSize = " << ThreadTileSize
+            << "ScalarPerVector = " << ScalarPerVector
             << ">";
         // clang-format on
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
similarity index 95%
rename from include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
rename to include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
index 789345c1312..0e2be0e4519 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
@@ -7,7 +7,7 @@
 
 namespace ck {
 
-template <typename GridwiseEltwise,
+template <typename GridwiseBinEltwise,
           typename ADataType,
           typename BDataType,
           typename CDataType,
@@ -21,13 +21,13 @@ __global__ void kernel_elementwise_1d(const ADataType* __restrict__ p_a_global,
                                       const GridDesc_M0 c_grid_desc_m0,
                                       const ElementwiseFunctor functor)
 {
-    GridwiseEltwise::Run(p_a_global,
-                         p_b_global,
-                         p_c_global,
-                         a_grid_desc_m0,
-                         b_grid_desc_m0,
-                         c_grid_desc_m0,
-                         functor);
+    GridwiseBinEltwise::Run(p_a_global,
+                            p_b_global,
+                            p_c_global,
+                            a_grid_desc_m0,
+                            b_grid_desc_m0,
+                            c_grid_desc_m0,
+                            functor);
 }
 
 template <typename ADataType,
@@ -39,7 +39,7 @@ template <typename ADataType,
           index_t ThreadPerBlock,
           index_t ThreadTileSize,
           index_t ScalarPerVector>
-struct GridwiseElementwise_1D
+struct GridwiseBinaryElementwise_1D
 {
     static constexpr auto I0           = Number<0>{};
     static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize;

From d7112d37f2d83e3d8b7f384dac7a60abe1adfc49 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Wed, 20 Apr 2022 17:21:02 +0000
Subject: [PATCH 20/28] Fix the padding

---
 .../gpu/device/device_binary_elementwise_2d.hpp                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
index fd67ed62746..5417314683f 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
@@ -42,7 +42,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
 
         // pad
         const auto m0  = desc_m0.GetLength(I0);
-        const auto pad = math::integer_least_multiple(m0, ScalarPerVector) - m0;
+        const auto pad = math::integer_least_multiple(m0, BlockTileSize) - m0;
         const auto desc_m0_pad =
             transform_tensor_descriptor(desc_m0,
                                         make_tuple(make_right_pad_transform(m0, pad)),

From 5d36f7a2eb9e322229b44ffb9c571ad14b4135a5 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Thu, 21 Apr 2022 15:16:17 +0000
Subject: [PATCH 21/28] Rewrite the elementwise operation. Let memory coalesce
 between block

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 25 ++++++----
 .../gpu/device/device_binary_elementwise.hpp  | 17 ++++---
 .../device/device_binary_elementwise_2d.hpp   | 47 ++++++++-----------
 .../grid/gridwise_binary_elementwise_1d.hpp   | 26 +++++-----
 include/ck/utility/get_id.hpp                 |  4 ++
 5 files changed, 60 insertions(+), 59 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 08aefd87c52..9d31f62ba6b 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -171,16 +171,21 @@ struct Div
 
 using DeviceElementwiseSubExpInstance =
     ck::tensor_operation::device::DeviceBinaryElementwise_2D<CDataType,
-                                                       CDataType,
-                                                       CDataType,
-                                                       EltwiseComputeDataType,
-                                                       Sub_Exp,
-                                                       256,
-                                                       32,
-                                                       8>;
-
-using DeviceElementwiseDivInstance = ck::tensor_operation::device::
-    DeviceBinaryElementwise_2D<CDataType, CDataType, CDataType, EltwiseComputeDataType, Div, 256, 32, 8>;
+                                                             CDataType,
+                                                             CDataType,
+                                                             EltwiseComputeDataType,
+                                                             Sub_Exp,
+                                                             256,
+                                                             8>;
+
+using DeviceElementwiseDivInstance =
+    ck::tensor_operation::device::DeviceBinaryElementwise_2D<CDataType,
+                                                             CDataType,
+                                                             CDataType,
+                                                             EltwiseComputeDataType,
+                                                             Div,
+                                                             256,
+                                                             8>;
 
 using HostGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
index 8809a1090ca..56ff5141fcb 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
@@ -12,15 +12,14 @@ template <typename ElementwiseFunctor>
 struct DeviceBinaryElementwise : public BaseOperator
 {
 
-    virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const void* p_a,
-                        const void* p_b,
-                        void* p_c,
-                        const std::vector<int>& shape_a,
-                        const std::vector<int>& stride_a,
-                        const std::vector<int>& shape_b,
-                        const std::vector<int>& stride_b,
-                        ElementwiseFunctor functor) = 0;
+    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                              const void* p_b,
+                                                              void* p_c,
+                                                              const std::vector<int>& shape_a,
+                                                              const std::vector<int>& stride_a,
+                                                              const std::vector<int>& shape_b,
+                                                              const std::vector<int>& stride_b,
+                                                              ElementwiseFunctor functor) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
index 5417314683f..3fa679cc2fe 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
@@ -16,15 +16,14 @@ template <typename ADataType,
           typename ComputeDataType,
           typename ElementwiseFunctor,
           index_t ThreadPerBlock,
-          index_t ThreadTileSize,
           index_t ScalarPerVector>
 struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFunctor>
 {
-    static_assert(ThreadTileSize % ScalarPerVector == 0);
-    static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize;
-    static constexpr auto I0           = Number<0>{};
+    static constexpr auto I0 = Number<0>{};
 
-    static auto MakeDescriptor_M0(const std::vector<int>& shape, const std::vector<int>& stride)
+    static auto MakeDescriptor_M0(const std::vector<int>& shape,
+                                  const std::vector<int>& stride,
+                                  index_t gridSize)
     {
         const int m = shape[0];
         const int n = shape[1];
@@ -41,8 +40,9 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
                                         make_tuple(Sequence<0>{}));
 
         // pad
-        const auto m0  = desc_m0.GetLength(I0);
-        const auto pad = math::integer_least_multiple(m0, BlockTileSize) - m0;
+        const auto m0           = desc_m0.GetLength(I0);
+        const index_t loop_step = gridSize * ThreadPerBlock * ScalarPerVector;
+        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
         const auto desc_m0_pad =
             transform_tensor_descriptor(desc_m0,
                                         make_tuple(make_right_pad_transform(m0, pad)),
@@ -51,15 +51,13 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         return desc_m0_pad;
     }
 
-    using GridDesc_M0        = decltype(MakeDescriptor_M0({1, 1}, {1, 1}));
+    using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1));
     using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
                                                             BDataType,
                                                             CDataType,
                                                             ComputeDataType,
                                                             GridDesc_M0,
                                                             ElementwiseFunctor,
-                                                            ThreadPerBlock,
-                                                            ThreadTileSize,
                                                             ScalarPerVector>;
 
     struct Argument : public BaseArgument
@@ -75,11 +73,12 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
             : p_a_(p_a),
               p_b_(p_b),
               p_c_(p_c),
-              a_grid_desc_m0_(MakeDescriptor_M0(shape, stride_a)),
-              b_grid_desc_m0_(MakeDescriptor_M0(shape, stride_b)),
-              c_grid_desc_m0_(MakeDescriptor_M0(shape, stride_c)),
-              functor_(functor)
+              functor_(functor),
+              gridSize_(128) // FIXME - Calculate the grid size by number of CU in the future
         {
+            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_);
+            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_);
+            c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_);
         }
 
         const ADataType* p_a_;
@@ -89,30 +88,25 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         GridDesc_M0 b_grid_desc_m0_;
         GridDesc_M0 c_grid_desc_m0_;
         ElementwiseFunctor functor_;
+        index_t gridSize_;
     };
 
     struct Invoker : public BaseInvoker
     {
-        index_t CalculateGridSize(const GridDesc_M0& grid_desc_m0)
-        {
-            const auto gridTileSize = grid_desc_m0.GetLength(I0);
-            return gridTileSize / BlockTileSize;
-        }
-
         float Run(const Argument& arg, int nrepeat = 1)
         {
-            const auto kernel      = kernel_elementwise_1d<GridwiseBinEltwise,
+            (void)arg;
+            const auto kernel = kernel_elementwise_1d<GridwiseBinEltwise,
                                                       ADataType,
                                                       BDataType,
                                                       CDataType,
                                                       GridDesc_M0,
                                                       ElementwiseFunctor>;
-            float avgTime          = 0;
-            const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m0_);
+            float avgTime = 0;
             if(nrepeat == 0)
             {
                 launch_kernel(kernel,
-                              dim3(gridSize),
+                              dim3(arg.gridSize_),
                               dim3(ThreadPerBlock),
                               0,
                               arg.p_a_,
@@ -127,7 +121,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
             {
                 avgTime = launch_and_time_kernel(kernel,
                                                  nrepeat,
-                                                 dim3(gridSize),
+                                                 dim3(arg.gridSize_),
                                                  dim3(ThreadPerBlock),
                                                  0,
                                                  arg.p_a_,
@@ -157,7 +151,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         // m * n
         const auto m0 = pArg->c_grid_desc_m0_.GetLength(I0);
 
-        if(m0 % BlockTileSize != 0)
+        if(m0 % ScalarPerVector != 0)
             return false;
 
         return true;
@@ -195,7 +189,6 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         str << "DeviceBinaryElementwise_2D"
             << "<"
             << "ThreadPerBlock = " << ThreadPerBlock
-            << "ThreadTileSize = " << ThreadTileSize
             << "ScalarPerVector = " << ScalarPerVector
             << ">";
         // clang-format on
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
index 0e2be0e4519..18eb4017e29 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
@@ -36,13 +36,10 @@ template <typename ADataType,
           typename ComputeDataType,
           typename GridDesc_M0,
           typename ElementwiseFunctor,
-          index_t ThreadPerBlock,
-          index_t ThreadTileSize,
           index_t ScalarPerVector>
 struct GridwiseBinaryElementwise_1D
 {
-    static constexpr auto I0           = Number<0>{};
-    static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize;
+    static constexpr auto I0 = Number<0>{};
     static constexpr auto thread_desc_M0 =
         make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));
 
@@ -50,10 +47,8 @@ struct GridwiseBinaryElementwise_1D
 
     static __device__ __host__ auto CalculateElementwiseIndex()
     {
-        const index_t thread_id = get_thread_local_1d_id();
-        const index_t block_id  = get_block_1d_id();
-
-        return make_multi_index(block_id * BlockTileSize + thread_id * ScalarPerVector);
+        const index_t global_thread_id = get_thread_global_1d_id();
+        return make_multi_index(global_thread_id * ScalarPerVector);
     }
 
     __device__ static void Run(const ADataType* __restrict__ p_a_global,
@@ -116,8 +111,13 @@ struct GridwiseBinaryElementwise_1D
                                                false>{
                 c_grid_desc_m0, thread_to_global_offset, PassThrough{}};
 
-        int num_iter                         = ThreadTileSize / ScalarPerVector;
-        constexpr auto thread_to_global_step = make_multi_index(ThreadPerBlock * ScalarPerVector);
+        const index_t threadPerBlock = get_block_size();
+        const index_t blockPerGrid   = get_grid_size();
+        const auto m0                = c_grid_desc_m0.GetLength(I0);
+        const index_t loop_step      = blockPerGrid * threadPerBlock * ScalarPerVector;
+        const auto loop_step_index   = make_multi_index(loop_step);
+
+        index_t num_iter = m0 / (loop_step);
         do
         {
             // read and process ScalarPerVector elements
@@ -140,9 +140,9 @@ struct GridwiseBinaryElementwise_1D
                                c_grid_desc_m0,
                                c_global_buf);
 
-            a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, thread_to_global_step);
-            b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, thread_to_global_step);
-            c_global_write.MoveDstSliceWindow(c_grid_desc_m0, thread_to_global_step);
+            a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index);
+            b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, loop_step_index);
+            c_global_write.MoveDstSliceWindow(c_grid_desc_m0, loop_step_index);
         } while(--num_iter);
     }
 };
diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp
index f742512d400..d2a689c1cca 100644
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
@@ -7,10 +7,14 @@ __device__ constexpr index_t get_wave_size() { return CK_GPU_WAVE_SIZE; }
 
 __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
 
+__device__ index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
+
 __device__ index_t get_wave_local_1d_id() { return threadIdx.x / get_wave_size(); }
 
 __device__ index_t get_block_1d_id() { return blockIdx.x; }
 
 __device__ index_t get_grid_size() { return gridDim.x; }
 
+__device__ index_t get_block_size() { return blockDim.x; }
+
 } // namespace ck

From 680cfaa715dbae4d079cc6bd5b676b909f371376 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Fri, 22 Apr 2022 04:20:08 +0000
Subject: [PATCH 22/28] Fix the meaning of broadcast dim parameter

---
 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 9d31f62ba6b..a0522f7fc79 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -223,7 +223,7 @@ void host_broadcast2D(
         {
             ComputeDataType Amn = static_cast<ComputeDataType>(A(m, n));
             ComputeDataType Cmn = 0;
-            if constexpr(broadcastDim == 1)
+            if constexpr(broadcastDim == 0)
             {
                 ComputeDataType Bn = static_cast<ComputeDataType>(B(n));
                 functor(Cmn, Amn, Bn);
@@ -516,7 +516,7 @@ int main(int argc, char* argv[])
                          Tensor<CDataType>,
                          EltwiseComputeDataType,
                          Sub_Exp,
-                         1>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{});
+                         0>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{});
 
         host_reduce_sum.Run(1, // alpha
                             reinterpret_cast<const HostReduceDataType*>(exp_m_n.mData.data()),
@@ -529,7 +529,7 @@ int main(int argc, char* argv[])
                          Tensor<CDataType>,
                          EltwiseComputeDataType,
                          Div,
-                         1>(host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{});
+                         0>(host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{});
 
         bool result = true;
         if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData))

From a41f5481d4adf974864dc48ace28bd395ba6223b Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Mon, 25 Apr 2022 09:12:23 +0000
Subject: [PATCH 23/28] 1. Fix coding style 2. Use DeviceGemm_Xdl_CShuffle
 instead of deprecated DeviceGemmXdl_C_Shuffle

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 26 +++++++++++--------
 .../grid/gridwise_binary_elementwise_1d.hpp   | 16 ++++++------
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index a0522f7fc79..0b4d0c41b34 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -15,7 +15,7 @@
 
 #include "device_tensor.hpp"
 #include "device_gemm_xdl.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"
@@ -50,19 +50,23 @@ using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
 using CLayout = ck::tensor_layout::gemm::RowMajor;
 
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle<
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
+    ALayout,                // ALayout
+    BLayout,                // BLayout
+    CLayout,                // CLayout
     ADataType,              // ADataType
     BDataType,              // BDataType
     CDataType,              // CDataType
     AccDataType,            // AccDataType
     CDataType,              // CShuffleDataType
-    ALayout,                // ALayout
-    BLayout,                // BLayout
-    CLayout,                // CLayout
     PassThrough,            // AElementwiseOperation
     PassThrough,            // BElementwiseOperation
     PassThrough,            // CElementwiseOperation
+    GemmDefault,            // GemmSpec
+    1,                      // NumGemmKPrefetchStage
     256,                    // BlockSize
     256,                    // MPerBlock
     128,                    // NPerBlock
@@ -89,7 +93,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle
     true,                   // BBlockLdsAddExtraN
     1,                      // CShuffleMXdlPerWavePerShuffle
     1,                      // CShuffleNXdlPerWavePerShuffle
-    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+    S<1, 32, 1, 8>,         // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
     8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
 
@@ -149,7 +153,7 @@ using DeviceReduceSumInstance =
                                                         1,
                                                         1>;
 
-struct Sub_Exp
+struct SubExp
 {
     __host__ __device__ constexpr void operator()(EltwiseComputeDataType& dst,
                                                   const EltwiseComputeDataType& src1,
@@ -174,7 +178,7 @@ using DeviceElementwiseSubExpInstance =
                                                              CDataType,
                                                              CDataType,
                                                              EltwiseComputeDataType,
-                                                             Sub_Exp,
+                                                             SubExp,
                                                              256,
                                                              8>;
 
@@ -412,7 +416,7 @@ int main(int argc, char* argv[])
                                             {StrideC, 1},
                                             {0, 1},
                                             {StrideC, 1},
-                                            Sub_Exp{});
+                                            SubExp{});
 
     if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get()))
     {
@@ -515,8 +519,8 @@ int main(int argc, char* argv[])
                          Tensor<CDataType>,
                          Tensor<CDataType>,
                          EltwiseComputeDataType,
-                         Sub_Exp,
-                         0>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{});
+                         SubExp,
+                         0>(host_exp_m_n, c_m_n, c_n_max, M, N, SubExp{});
 
         host_reduce_sum.Run(1, // alpha
                             reinterpret_cast<const HostReduceDataType*>(exp_m_n.mData.data()),
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
index 18eb4017e29..aea54ff53c4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp
@@ -40,7 +40,7 @@ template <typename ADataType,
 struct GridwiseBinaryElementwise_1D
 {
     static constexpr auto I0 = Number<0>{};
-    static constexpr auto thread_desc_M0 =
+    static constexpr auto thread_desc_m0 =
         make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));
 
     using PassThrough = tensor_operation::element_wise::PassThrough;
@@ -76,7 +76,7 @@ struct GridwiseBinaryElementwise_1D
             ThreadwiseTensorSliceTransfer_v2<ADataType,
                                              ComputeDataType,
                                              GridDesc_M0,
-                                             decltype(thread_desc_M0),
+                                             decltype(thread_desc_m0),
                                              Sequence<ScalarPerVector>, // SliceLengths
                                              Sequence<0>,               // DimAccessOrder
                                              0,                         // SrcVectorDim
@@ -88,7 +88,7 @@ struct GridwiseBinaryElementwise_1D
             ThreadwiseTensorSliceTransfer_v2<BDataType,
                                              ComputeDataType,
                                              GridDesc_M0,
-                                             decltype(thread_desc_M0),
+                                             decltype(thread_desc_m0),
                                              Sequence<ScalarPerVector>, // SliceLengths
                                              Sequence<0>,               // DimAccessOrder
                                              0,                         // SrcVectorDim
@@ -99,7 +99,7 @@ struct GridwiseBinaryElementwise_1D
         auto c_global_write =
             ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
                                                CDataType,
-                                               decltype(thread_desc_M0),
+                                               decltype(thread_desc_m0),
                                                GridDesc_M0,
                                                PassThrough,
                                                Sequence<ScalarPerVector>, // SliceLengths
@@ -122,19 +122,19 @@ struct GridwiseBinaryElementwise_1D
         {
             // read and process ScalarPerVector elements
             a_global_load.Run(
-                a_grid_desc_m0, a_global_buf, thread_desc_M0, make_tuple(I0), a_thread_buf);
+                a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf);
 
             b_global_load.Run(
-                b_grid_desc_m0, b_global_buf, thread_desc_M0, make_tuple(I0), b_thread_buf);
+                b_grid_desc_m0, b_global_buf, thread_desc_m0, make_tuple(I0), b_thread_buf);
 
             static_for<0, ScalarPerVector, 1>{}([&](auto m) {
-                constexpr auto offset = thread_desc_M0.CalculateOffset(make_tuple(m));
+                constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
                 functor(c_thread_buf(Number<offset>{}),
                         a_thread_buf(Number<offset>{}),
                         b_thread_buf(Number<offset>{}));
             });
 
-            c_global_write.Run(thread_desc_M0,
+            c_global_write.Run(thread_desc_m0,
                                make_tuple(I0), // SrcSliceOriginIdx
                                c_thread_buf,
                                c_grid_desc_m0,

From f919809dfa5c174117dfca7bfa1bb283cd37925c Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Tue, 26 Apr 2022 01:54:08 +0000
Subject: [PATCH 24/28] Move threadPerBlock to argument

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp |  8 ++---
 .../gpu/device/device_binary_elementwise.hpp  |  3 +-
 .../device/device_binary_elementwise_2d.hpp   | 32 +++++++++++--------
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 0b4d0c41b34..b477ff68d46 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -179,7 +179,6 @@ using DeviceElementwiseSubExpInstance =
                                                              CDataType,
                                                              EltwiseComputeDataType,
                                                              SubExp,
-                                                             256,
                                                              8>;
 
 using DeviceElementwiseDivInstance =
@@ -188,7 +187,6 @@ using DeviceElementwiseDivInstance =
                                                              CDataType,
                                                              EltwiseComputeDataType,
                                                              Div,
-                                                             256,
                                                              8>;
 
 using HostGemmInstance = ck::tensor_operation::host::
@@ -416,7 +414,8 @@ int main(int argc, char* argv[])
                                             {StrideC, 1},
                                             {0, 1},
                                             {StrideC, 1},
-                                            SubExp{});
+                                            SubExp{},
+                                            256);
 
     if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get()))
     {
@@ -466,7 +465,8 @@ int main(int argc, char* argv[])
                                          {StrideC, 1},
                                          {0, 1},
                                          {StrideC, 1},
-                                         Div{});
+                                         Div{},
+                                         256);
 
     if(!broadcastDiv.IsSupportedArgument(broadcastDiv_argument_ptr.get()))
     {
diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
index 56ff5141fcb..f97e7d835ff 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
@@ -19,7 +19,8 @@ struct DeviceBinaryElementwise : public BaseOperator
                                                               const std::vector<int>& stride_a,
                                                               const std::vector<int>& shape_b,
                                                               const std::vector<int>& stride_b,
-                                                              ElementwiseFunctor functor) = 0;
+                                                              ElementwiseFunctor functor,
+                                                              index_t threadPerBlock) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
index 3fa679cc2fe..370efd483e5 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
@@ -15,7 +15,6 @@ template <typename ADataType,
           typename CDataType,
           typename ComputeDataType,
           typename ElementwiseFunctor,
-          index_t ThreadPerBlock,
           index_t ScalarPerVector>
 struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFunctor>
 {
@@ -23,7 +22,8 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
 
     static auto MakeDescriptor_M0(const std::vector<int>& shape,
                                   const std::vector<int>& stride,
-                                  index_t gridSize)
+                                  index_t gridSize,
+                                  index_t threadPerBlock)
     {
         const int m = shape[0];
         const int n = shape[1];
@@ -41,7 +41,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
 
         // pad
         const auto m0           = desc_m0.GetLength(I0);
-        const index_t loop_step = gridSize * ThreadPerBlock * ScalarPerVector;
+        const index_t loop_step = gridSize * threadPerBlock * ScalarPerVector;
         const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
         const auto desc_m0_pad =
             transform_tensor_descriptor(desc_m0,
@@ -51,7 +51,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         return desc_m0_pad;
     }
 
-    using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1));
+    using GridDesc_M0        = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
     using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
                                                             BDataType,
                                                             CDataType,
@@ -69,16 +69,18 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
                  const std::vector<int>& stride_a,
                  const std::vector<int>& stride_b,
                  const std::vector<int>& stride_c,
-                 ElementwiseFunctor functor)
+                 ElementwiseFunctor functor,
+                 index_t threadPerBlock)
             : p_a_(p_a),
               p_b_(p_b),
               p_c_(p_c),
               functor_(functor),
+              threadPerBlock_(threadPerBlock),
               gridSize_(128) // FIXME - Calculate the grid size by number of CU in the future
         {
-            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_);
-            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_);
-            c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_);
+            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, threadPerBlock_);
+            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, threadPerBlock_);
+            c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, threadPerBlock_);
         }
 
         const ADataType* p_a_;
@@ -88,6 +90,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         GridDesc_M0 b_grid_desc_m0_;
         GridDesc_M0 c_grid_desc_m0_;
         ElementwiseFunctor functor_;
+        index_t threadPerBlock_;
         index_t gridSize_;
     };
 
@@ -102,12 +105,12 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
                                                       CDataType,
                                                       GridDesc_M0,
                                                       ElementwiseFunctor>;
-            float avgTime = 0;
+            float avgTime     = 0;
             if(nrepeat == 0)
             {
                 launch_kernel(kernel,
                               dim3(arg.gridSize_),
-                              dim3(ThreadPerBlock),
+                              dim3(arg.threadPerBlock_),
                               0,
                               arg.p_a_,
                               arg.p_b_,
@@ -122,7 +125,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
                 avgTime = launch_and_time_kernel(kernel,
                                                  nrepeat,
                                                  dim3(arg.gridSize_),
-                                                 dim3(ThreadPerBlock),
+                                                 dim3(arg.threadPerBlock_),
                                                  0,
                                                  arg.p_a_,
                                                  arg.p_b_,
@@ -164,7 +167,8 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
                                                       const std::vector<int>& stride_a,
                                                       const std::vector<int>& stride_b,
                                                       const std::vector<int>& stride_c,
-                                                      ElementwiseFunctor functor) override
+                                                      ElementwiseFunctor functor,
+                                                      index_t threadPerBlock) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
@@ -173,7 +177,8 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
                                           stride_a,
                                           stride_b,
                                           stride_c,
-                                          functor);
+                                          functor,
+                                          threadPerBlock);
     }
 
     std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
@@ -188,7 +193,6 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
         // clang-format off
         str << "DeviceBinaryElementwise_2D"
             << "<"
-            << "ThreadPerBlock = " << ThreadPerBlock
             << "ScalarPerVector = " << ScalarPerVector
             << ">";
         // clang-format on

From 976815e539cf6e67bd742e2521aca7ddd60f6024 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Thu, 28 Apr 2022 16:18:18 +0800
Subject: [PATCH 25/28] Prevent compile error when user pass rvalue, eg {3, 4}

---
 .../gpu/device/device_binary_elementwise.hpp              | 8 ++++----
 .../gpu/device/device_binary_elementwise_2d.hpp           | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
index f97e7d835ff..eba2d7979f2 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
@@ -15,10 +15,10 @@ struct DeviceBinaryElementwise : public BaseOperator
     virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                               const void* p_b,
                                                               void* p_c,
-                                                              const std::vector<int>& shape_a,
-                                                              const std::vector<int>& stride_a,
-                                                              const std::vector<int>& shape_b,
-                                                              const std::vector<int>& stride_b,
+                                                              std::vector<int> shape_a,
+                                                              std::vector<int> stride_a,
+                                                              std::vector<int> shape_b,
+                                                              std::vector<int> stride_b,
                                                               ElementwiseFunctor functor,
                                                               index_t threadPerBlock) = 0;
 
diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
index 370efd483e5..9d37a1d6439 100644
--- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp
@@ -163,10 +163,10 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise<ElementwiseFu
     std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                       const void* p_b,
                                                       void* p_c,
-                                                      const std::vector<int>& shape,
-                                                      const std::vector<int>& stride_a,
-                                                      const std::vector<int>& stride_b,
-                                                      const std::vector<int>& stride_c,
+                                                      std::vector<int> shape,
+                                                      std::vector<int> stride_a,
+                                                      std::vector<int> stride_b,
+                                                      std::vector<int> stride_c,
                                                       ElementwiseFunctor functor,
                                                       index_t threadPerBlock) override
     {

From ea09fd3260ca7e342c3ed2757c4cd45c0f119e52 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Thu, 28 Apr 2022 16:42:53 +0800
Subject: [PATCH 26/28] Fix typo

---
 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index b477ff68d46..156e3f467d5 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -42,7 +42,7 @@ using CDataType              = F16;
 using AccDataType            = F32;
 using EltwiseComputeDataType = F32;
 
-// CAUSION - host reduce_max will call numeric_limits<ck::half_t>::lowest()
+// CAUTION - host reduce_max will call numeric_limits<ck::half_t>::lowest()
 // However, numeric_limits<ck::half_t>::lowest() will return zero. So, used half_float::half instead
 using HostReduceDataType = half_float::half;
 

From bfc80764c257c3f4a403da7780c4c1dd9125c347 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Fri, 29 Apr 2022 16:14:17 +0800
Subject: [PATCH 27/28] [What] Fix data type for host reduction [Why] F16 issue
 for host reduction has been fix in c1ef73192e9303f48bac53327150dac4983af51d

---
 .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index 156e3f467d5..bce9fefae1a 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -42,10 +42,6 @@ using CDataType              = F16;
 using AccDataType            = F32;
 using EltwiseComputeDataType = F32;
 
-// CAUTION - host reduce_max will call numeric_limits<ck::half_t>::lowest()
-// However, numeric_limits<ck::half_t>::lowest() will return zero. So, used half_float::half instead
-using HostReduceDataType = half_float::half;
-
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
 using CLayout = ck::tensor_layout::gemm::RowMajor;
@@ -192,18 +188,18 @@ using DeviceElementwiseDivInstance =
 using HostGemmInstance = ck::tensor_operation::host::
     ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
 
-using HostReduceMaxInstance = ReductionHost<HostReduceDataType,
-                                            HostReduceDataType,
-                                            HostReduceDataType,
+using HostReduceMaxInstance = ReductionHost<CDataType,
+                                            CDataType,
+                                            CDataType,
                                             ReduceMaxId,
                                             Rank,
                                             NumReduceDim,
                                             ReducePropagateNan,
                                             false>;
 
-using HostReduceSumInstance = ReductionHost<HostReduceDataType,
+using HostReduceSumInstance = ReductionHost<CDataType,
                                             AccDataType,
-                                            HostReduceDataType,
+                                            CDataType,
                                             ReduceSumId,
                                             Rank,
                                             NumReduceDim,
@@ -510,9 +506,9 @@ int main(int argc, char* argv[])
 
         host_gemm_invoker.Run(host_gemm_argument);
         host_reduce_max.Run(1, // alpha
-                            reinterpret_cast<const HostReduceDataType*>(c_m_n.mData.data()),
+                            c_m_n.mData.data(),
                             0, // beta
-                            reinterpret_cast<HostReduceDataType*>(host_c_n_max.mData.data()),
+                            host_c_n_max.mData.data(),
                             host_indices.mData.data());
 
         host_broadcast2D<Tensor<CDataType>,
@@ -523,9 +519,9 @@ int main(int argc, char* argv[])
                          0>(host_exp_m_n, c_m_n, c_n_max, M, N, SubExp{});
 
         host_reduce_sum.Run(1, // alpha
-                            reinterpret_cast<const HostReduceDataType*>(exp_m_n.mData.data()),
+                            exp_m_n.mData.data(),
                             0, // beta
-                            reinterpret_cast<HostReduceDataType*>(host_exp_n_sum.mData.data()),
+                            host_exp_n_sum.mData.data(),
                             host_indices.mData.data());
 
         host_broadcast2D<Tensor<CDataType>,

From b6fe118ed0355b91956e6ad03ede09b55dc93264 Mon Sep 17 00:00:00 2001
From: rocking <chunyu.lai@amd.com>
Date: Mon, 9 May 2022 13:37:42 +0800
Subject: [PATCH 28/28] Fix typo

---
 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
index bce9fefae1a..13518c1a6e4 100644
--- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
+++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp
@@ -373,8 +373,8 @@ int main(int argc, char* argv[])
 
     // do reduce max
     auto reduce_max                 = DeviceReduceMaxInstance{};
-    auto reduce_max_workaspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims);
-    DeviceMem reduce_max_workaspace_device_buf(reduce_max_workaspace_size);
+    auto reduce_max_workspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims);
+    DeviceMem reduce_max_workaspace_device_buf(reduce_max_workspace_size);
 
     auto reduce_max_argument_ptr = reduce_max.MakeArgumentPointer(
         c_m_n_shape,