From d6e053a3ce8fb8212ec374a519aaec6bd7d43bce Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 4 Apr 2022 04:16:01 +0800 Subject: [PATCH 01/28] Part of gemm + softmax, Add gemm + reduceMax --- example/19_gemm_softmax/CMakeLists.txt | 1 + .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 281 ++++++++++++++++++ example/CMakeLists.txt | 1 + 3 files changed, 283 insertions(+) create mode 100644 example/19_gemm_softmax/CMakeLists.txt create mode 100644 example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp diff --git a/example/19_gemm_softmax/CMakeLists.txt b/example/19_gemm_softmax/CMakeLists.txt new file mode 100644 index 00000000000..740f931e5ab --- /dev/null +++ b/example/19_gemm_softmax/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_gemm_softmax_xdl_fp16 gemm_softmax_xdl_fp16.cpp) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp new file mode 100644 index 00000000000..cb60ae42372 --- /dev/null +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -0,0 +1,281 @@ +#include +#include +#include +#include +#include +#include +#include "config.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_reduce_util.hpp" +#include "device_tensor.hpp" +#include "device_gemm_xdl.hpp" +#include "device_gemm_xdl_c_shuffle.hpp" +#include "element_wise_operation.hpp" +#include "reference_gemm.hpp" +#include "gemm_specialization.hpp" + +#include "device_reduce_blockwise.hpp" +#include "reduction_enums.hpp" +#include "reduction_operator_mapping.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; +using AccDataType = F32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle< + ADataType, // ADataType + BDataType, // BDataType + CDataType, // CDataType + AccDataType, // AccDataType + CDataType, // CShuffleDataType + ALayout, // ALayout + BLayout, // BLayout + CLayout, // CLayout + PassThrough, // AElementwiseOperation + PassThrough, // BElementwiseOperation + PassThrough, // CElementwiseOperation + 256, // BlockSize + 256, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXDL + 32, // NPerXDL + 4, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl +// clang-format on + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; +constexpr ck::ReduceTensorOp_t ReduceOpId = ck::ReduceTensorOp_t::MAX; +constexpr ck::NanPropagation_t NanOpt = ck::NanPropagation_t::PROPAGATE_NAN; +constexpr bool PropagateNan = (NanOpt == ck::NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; +// constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES; +using ReduceOperation = typename ck::reduce_binary_operator::opType; +using InElementwiseOperation = + typename ck::reduce_unary_operator::InElementwiseOperation; +using AccElementwiseOperation = + typename ck::reduce_unary_operator::AccElementwiseOperation; + +using DeviceReduceInstance = + ck::tensor_operation::device::DeviceReduceBlockWise; + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + const std::vector reduceDims{0}; + const std::vector reduceInvariantDims{1}; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: run kernel # of times (>1)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1, stride})); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_max(std::vector({static_cast(N)}), + std::vector({1})); + + const auto i_inLengths = ck::to_int_vector(c_m_n.mDesc.GetLengths()); + const auto i_inStrides = ck::to_int_vector(c_m_n.mDesc.GetStrides()); + const auto i_outLengths = ck::to_int_vector(c_m_n_max.mDesc.GetLengths()); + const auto i_outStrides = ck::to_int_vector(c_m_n_max.mDesc.GetStrides()); + + size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_m_n_max.mDesc.GetElementSize(); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n.mDesc << std::endl; + std::cout << "c_m_n_max: " << c_m_n_max.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace()); + DeviceMem c_m_n_max_device_buf(sizeof(CDataType) * c_m_n_max.mDesc.GetElementSpace()); + DeviceMem c_m_n_max_indices_dev(0); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto gemm_invoker = gemm.MakeInvoker(); + auto gemm_argument = + gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + if(!gemm.IsSupportedArgument(gemm_argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + gemm_invoker.Run(gemm_argument, nrepeat); + + // do reduce max + auto reduce = DeviceReduceInstance{}; + auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + DeviceMem ws_dev(wsSizeInBytes); + + auto argument_ptr = + reduce.MakeArgumentPointer(i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + 1, + 0, + c_m_n_device_buf.GetDeviceBuffer(), + c_m_n_max_device_buf.GetDeviceBuffer(), + c_m_n_max_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + InElementwiseOperation{static_cast(reduce_total_length)}, + AccElementwiseOperation{static_cast(reduce_total_length)}); + + if(!reduce.IsSupportedArgument(argument_ptr.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + }; + + auto invoker_ptr = reduce.MakeInvokerPointer(); + invoker_ptr->Run(argument_ptr.get(), nrepeat); + + // TODO = do_verification + if(do_verification) + ; + return 0; +} diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 830d1189de5..92d6945a466 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -43,3 +43,4 @@ add_subdirectory(17_convnd_bwd_data_xdl) add_subdirectory(15_grouped_gemm) add_subdirectory(16_gemm_reduce) add_subdirectory(18_batched_gemm_reduce) +add_subdirectory(19_gemm_softmax) From cbbc7e52393f31d6e823d7f4aa691b8cb70e659b Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 6 Apr 2022 10:35:12 +0000 Subject: [PATCH 02/28] Refine the comment --- example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index cb60ae42372..d11d14b79d4 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -274,8 +274,8 @@ int main(int argc, char* argv[]) auto invoker_ptr = reduce.MakeInvokerPointer(); invoker_ptr->Run(argument_ptr.get(), nrepeat); + // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv // TODO = do_verification - if(do_verification) - ; + (void)do_verification; return 0; } From 3e811ccf0479ddbcca9331aac6ace6778e784db6 Mon Sep 17 00:00:00 2001 From: rocking Date: Sun, 10 Apr 2022 18:57:17 +0000 Subject: [PATCH 03/28] Add device op for elementwise 2d --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 81 ++++++++---- .../gpu/device/device_elementwise.hpp | 30 +++++ .../gpu/device/device_elementwise_2d.hpp | 120 ++++++++++++++++++ 3 files changed, 208 insertions(+), 23 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index d11d14b79d4..a3124e1742d 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -19,6 +19,7 @@ #include "device_reduce_blockwise.hpp" #include "reduction_enums.hpp" #include "reduction_operator_mapping.hpp" +#include "device_elementwise_2d.hpp" template using S = ck::Sequence; @@ -115,6 +116,17 @@ using DeviceReduceInstance = 1, 1>; +struct Sub +{ + __host__ __device__ constexpr void operator()(F16& dst, const F16& src1, const F16& src2) const + { + dst = src1 - src2; + } +}; + +using DeviceElementwiseInstance = + ck::tensor_operation::device::DeviceElementwise_2D; + using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; @@ -184,6 +196,7 @@ int main(int argc, char* argv[]) Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_max(std::vector({static_cast(N)}), std::vector({1})); + Tensor d_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); const auto i_inLengths = ck::to_int_vector(c_m_n.mDesc.GetLengths()); const auto i_inStrides = ck::to_int_vector(c_m_n.mDesc.GetStrides()); @@ -196,6 +209,7 @@ int main(int argc, char* argv[]) std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n.mDesc << std::endl; std::cout << "c_m_n_max: " << c_m_n_max.mDesc << std::endl; + std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; switch(init_method) { @@ -214,6 +228,7 @@ int main(int argc, char* argv[]) DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace()); DeviceMem c_m_n_max_device_buf(sizeof(CDataType) * c_m_n_max.mDesc.GetElementSpace()); DeviceMem c_m_n_max_indices_dev(0); + DeviceMem d_m_n_device_buf(sizeof(CDataType) * d_m_n.mDesc.GetElementSpace()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.mData.data()); @@ -245,34 +260,54 @@ int main(int argc, char* argv[]) gemm_invoker.Run(gemm_argument, nrepeat); // do reduce max - auto reduce = DeviceReduceInstance{}; - auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims); + auto reduce_max = DeviceReduceInstance{}; + auto wsSizeInBytes = reduce_max.GetWorkspaceSizeInBytes(i_inLengths, reduceDims); DeviceMem ws_dev(wsSizeInBytes); - auto argument_ptr = - reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, - reduceDims, - 1, - 0, - c_m_n_device_buf.GetDeviceBuffer(), - c_m_n_max_device_buf.GetDeviceBuffer(), - c_m_n_max_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation{static_cast(reduce_total_length)}, - AccElementwiseOperation{static_cast(reduce_total_length)}); - - if(!reduce.IsSupportedArgument(argument_ptr.get())) + auto reduce_max_argument_ptr = reduce_max.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + i_outStrides, + reduceDims, + 1, + 0, + c_m_n_device_buf.GetDeviceBuffer(), + c_m_n_max_device_buf.GetDeviceBuffer(), + c_m_n_max_indices_dev.GetDeviceBuffer(), + ws_dev.GetDeviceBuffer(), + InElementwiseOperation{static_cast(reduce_total_length)}, + AccElementwiseOperation{static_cast(reduce_total_length)}); + + if(!reduce_max.IsSupportedArgument(reduce_max_argument_ptr.get())) { - std::cout - << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" - << std::endl; + throw std::runtime_error( + "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"); + }; + + auto reduce_max_invoker_ptr = reduce_max.MakeInvokerPointer(); + reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat); + + // do broadcast sub + auto broadcastSub = DeviceElementwiseInstance{}; + auto broadcastSub_argument_ptr = + broadcastSub.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(), + c_m_n_max_device_buf.GetDeviceBuffer(), + d_m_n_device_buf.GetDeviceBuffer(), + {M, N}, + {StrideC, 1}, + {0, 1}, + {StrideC, 1}, + Sub{}); + + if(!broadcastSub.IsSupportedArgument(broadcastSub_argument_ptr.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the DeviceElementwise_2D instance, exiting!"); }; - auto invoker_ptr = reduce.MakeInvokerPointer(); - invoker_ptr->Run(argument_ptr.get(), nrepeat); + auto broadcastSub_invoker_ptr = broadcastSub.MakeInvokerPointer(); + broadcastSub_invoker_ptr->Run(broadcastSub_argument_ptr.get(), nrepeat); // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv // TODO = do_verification diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp new file mode 100644 index 00000000000..e79e1112256 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp @@ -0,0 +1,30 @@ +#pragma once +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwise : public BaseOperator +{ + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + const std::vector& shape_a, + const std::vector& stride_a, + const std::vector& shape_b, + const std::vector& stride_b, + ElementwiseFunctor functor) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp new file mode 100644 index 00000000000..e060927ab18 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -0,0 +1,120 @@ +#pragma once +#include +#include + +#include "device.hpp" +#include "device_elementwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwise_2D : public DeviceElementwise +{ + static auto Make2dDescriptor_M_N(const std::vector& shape, const std::vector& stride) + { + return make_naive_tensor_descriptor(make_tuple(shape[0], shape[1]), + make_tuple(stride[0], stride[1])); + } + + using GridDesc_M_N = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1})); + + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + const std::vector& shape, + const std::vector& stride_a, + const std::vector& stride_b, + const std::vector& stride_c, + ElementwiseFunctor functor) + : p_a_(p_a), + p_b_(p_b), + p_c_(p_c), + a_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_a)), + b_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_b)), + c_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_c)), + functor_(functor) + { + } + + const ADataType* p_a_; + const BDataType* p_b_; + CDataType* p_c_; + GridDesc_M_N a_grid_desc_m_n_; + GridDesc_M_N b_grid_desc_m_n_; + GridDesc_M_N c_grid_desc_m_n_; + ElementwiseFunctor functor_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, int nrepeat = 1) + { + // TODO + (void)arg; + (void)nrepeat; + return 0; + } + + float Run(const BaseArgument* p_arg, int nrepeat = 1) override + { + return Run(*dynamic_cast(p_arg), nrepeat); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + // TODO: properly implement this check + const Argument* pArg = dynamic_cast(p_arg); + return pArg != nullptr; + }; + + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + const std::vector& shape, + const std::vector& stride_a, + const std::vector& stride_b, + const std::vector& stride_c, + ElementwiseFunctor functor) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + shape, + stride_a, + stride_b, + stride_c, + functor); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceElementwise_2D" + << "<" + << BlockSize + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck From 6818b58c93b3825b0fef98181ef131827523cd21 Mon Sep 17 00:00:00 2001 From: rocking Date: Sun, 10 Apr 2022 19:06:01 +0000 Subject: [PATCH 04/28] Fix compile error --- example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index a3124e1742d..cfa26184bff 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -86,9 +86,9 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle constexpr int Rank = 2; constexpr int NumReduceDim = 1; -constexpr ck::ReduceTensorOp_t ReduceOpId = ck::ReduceTensorOp_t::MAX; -constexpr ck::NanPropagation_t NanOpt = ck::NanPropagation_t::PROPAGATE_NAN; -constexpr bool PropagateNan = (NanOpt == ck::NanPropagation_t::NOT_PROPAGATE_NAN) ? false : true; +constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX; +constexpr ck::NanPropagation NanOpt = ck::NanPropagation::PROPAGATE_NAN; +constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true; // constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES; using ReduceOperation = typename ck::reduce_binary_operator::opType; using InElementwiseOperation = From e3a09b57558a3211ac6dc51ace5ca17696fb1dfe Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 11 Apr 2022 10:06:05 +0000 Subject: [PATCH 05/28] Add gridwise_elementwise_2d api --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 2 +- .../gpu/device/device_elementwise_2d.hpp | 93 +++++++++++++++++-- .../gpu/grid/gridwise_elementwise_2d.hpp | 82 ++++++++++++++++ 3 files changed, 169 insertions(+), 8 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index cfa26184bff..af8152e9d9f 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -125,7 +125,7 @@ struct Sub }; using DeviceElementwiseInstance = - ck::tensor_operation::device::DeviceElementwise_2D; + ck::tensor_operation::device::DeviceElementwise_2D; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp index e060927ab18..cabdcc6b5e8 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -4,6 +4,7 @@ #include "device.hpp" #include "device_elementwise.hpp" +#include "gridwise_elementwise_2d.hpp" namespace ck { namespace tensor_operation { @@ -12,17 +13,36 @@ namespace device { template + typename ElementwiseFunctor, + index_t MThreadPerBlock, + index_t NThreadPerBlock, + index_t MThreadTileSize, + index_t NThreadTileSize> struct DeviceElementwise_2D : public DeviceElementwise { + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static auto Make2dDescriptor_M_N(const std::vector& shape, const std::vector& stride) { return make_naive_tensor_descriptor(make_tuple(shape[0], shape[1]), make_tuple(stride[0], stride[1])); } - using GridDesc_M_N = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1})); + static constexpr index_t BlockSize = MThreadPerBlock * NThreadPerBlock; + static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize; + static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize; + + using GridDesc_M_N = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1})); + using GridwiseEltwise = GridwiseElementwise_2D; struct Argument : public BaseArgument { @@ -55,12 +75,63 @@ struct DeviceElementwise_2D : public DeviceElementwise struct Invoker : public BaseInvoker { + index_t CalculateGridSize(const GridDesc_M_N& grid_desc_m_n) + { + const auto M = grid_desc_m_n.GetLength(I0); + const auto N = grid_desc_m_n.GetLength(I1); + + assert(M % M_BlockTileSize == 0); + assert(N % N_BlockTileSize == 0); + + return (M / M_BlockTileSize) * (N / N_BlockTileSize); + } + float Run(const Argument& arg, int nrepeat = 1) { + const auto kernel = kernel_elementwise_2d; // TODO (void)arg; (void)nrepeat; - return 0; + (void)kernel; + float avgTime = 0; + const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m_n_); + if(nrepeat == 0) + { + launch_kernel(kernel, + dim3(gridSize), + dim3(BlockSize), + 0, + arg.p_a_, + arg.p_b_, + arg.p_c_, + arg.a_grid_desc_m_n_, + arg.b_grid_desc_m_n_, + arg.c_grid_desc_m_n_, + arg.functor_); + } + else + { + avgTime = launch_and_time_kernel(kernel, + nrepeat, + dim3(gridSize), + dim3(BlockSize), + 0, + arg.p_a_, + arg.p_b_, + arg.p_c_, + arg.a_grid_desc_m_n_, + arg.b_grid_desc_m_n_, + arg.c_grid_desc_m_n_, + arg.functor_); + } + return avgTime; } float Run(const BaseArgument* p_arg, int nrepeat = 1) override @@ -71,9 +142,18 @@ struct DeviceElementwise_2D : public DeviceElementwise bool IsSupportedArgument(const BaseArgument* p_arg) override { - // TODO: properly implement this check const Argument* pArg = dynamic_cast(p_arg); - return pArg != nullptr; + + if(pArg == nullptr) + return false; + + const auto M = pArg->c_grid_desc_m_n_.GetLength(I0); + const auto N = pArg->c_grid_desc_m_n_.GetLength(I1); + + if(M % M_BlockTileSize != 0 && N % N_BlockTileSize != 0) + return false; + + return true; }; std::unique_ptr MakeArgumentPointer(const void* p_a, @@ -107,7 +187,6 @@ struct DeviceElementwise_2D : public DeviceElementwise // clang-format off str << "DeviceElementwise_2D" << "<" - << BlockSize << ">"; // clang-format on diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp new file mode 100644 index 00000000000..d8d55ee56fc --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -0,0 +1,82 @@ +#pragma once + +#include "cluster_descriptor.hpp" +#include "data_type.hpp" +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const AGridDesc_M_N a_grid_desc_m_k, + const BGridDesc_M_N b_grid_desc_m_k, + const CGridDesc_M_N c_grid_desc_m_k, + const ElementwiseFunctor functor) +{ + GridwiseEltwise::Run(p_a_global, + p_b_global, + p_c_global, + a_grid_desc_m_k, + b_grid_desc_m_k, + c_grid_desc_m_k, + functor); +} + +template +struct GridwiseElementwise_2D +{ + __device__ static void Run(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const AGridDesc_M_N a_grid_desc_m_n, + const BGridDesc_M_N b_grid_desc_m_n, + const CGridDesc_M_N c_grid_desc_m_n, + const ElementwiseFunctor functor) + { + // const index_t thread_id = get_thread_local_1d_id(); + // const index_t block_id = get_block_1d_id(); + // printf("block_id = %d, thread_id = %d \n", block_id, thread_id); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_grid_desc_m_n.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_grid_desc_m_n.GetElementSpaceSize()); + const auto c_global_buf = make_dynamic_buffer( + p_c_global, c_grid_desc_m_n.GetElementSpaceSize()); + + StaticBuffer + a_thread_buf; + StaticBuffer + b_thread_buf; + StaticBuffer + c_thread_buf; + + // TODO - buffer_load, apply functor, buffer_store + (void)a_global_buf; + (void)b_global_buf; + (void)c_global_buf; + (void)a_thread_buf; + (void)b_thread_buf; + (void)c_thread_buf; + (void)functor; + } +}; + +} // namespace ck From a760a73213f3352f97688e6eb4eaddcdba79f6e1 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 12 Apr 2022 19:55:43 +0000 Subject: [PATCH 06/28] A kernel of elementwise_2d (except global store) --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 14 +-- .../gpu/device/device_elementwise_2d.hpp | 29 +++-- .../gpu/grid/gridwise_elementwise_2d.hpp | 117 ++++++++++++++---- 3 files changed, 118 insertions(+), 42 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index af8152e9d9f..dd5c32ade61 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -84,8 +84,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -constexpr int Rank = 2; -constexpr int NumReduceDim = 1; +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX; constexpr ck::NanPropagation NanOpt = ck::NanPropagation::PROPAGATE_NAN; constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true; @@ -118,14 +118,14 @@ using DeviceReduceInstance = struct Sub { - __host__ __device__ constexpr void operator()(F16& dst, const F16& src1, const F16& src2) const + __host__ __device__ constexpr void operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const { dst = src1 - src2; } }; -using DeviceElementwiseInstance = - ck::tensor_operation::device::DeviceElementwise_2D; +using DeviceElementwiseInstance = ck::tensor_operation::device:: + DeviceElementwise_2D; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; @@ -302,8 +302,8 @@ int main(int argc, char* argv[]) if(!broadcastSub.IsSupportedArgument(broadcastSub_argument_ptr.get())) { - throw std::runtime_error( - "The runtime parameters seems not supported by the DeviceElementwise_2D instance, exiting!"); + throw std::runtime_error("The runtime parameters seems not supported by the " + "DeviceElementwise_2D instance, exiting!"); }; auto broadcastSub_invoker_ptr = broadcastSub.MakeInvokerPointer(); diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp index cabdcc6b5e8..db9974b4a00 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -17,9 +17,17 @@ template + index_t NThreadTileSize, + index_t AThreadTransferSrcVectorDim, + index_t AThreadTransferSrcScalarPerVector, + index_t BThreadTransferSrcVectorDim, + index_t BThreadTransferSrcScalarPerVector, + index_t CThreadTransferSrcScalarPerVector> struct DeviceElementwise_2D : public DeviceElementwise { + static_assert(NThreadTileSize % AThreadTransferSrcScalarPerVector == 0 && + NThreadTileSize % BThreadTransferSrcScalarPerVector == 0); + static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -38,11 +46,16 @@ struct DeviceElementwise_2D : public DeviceElementwise BDataType, CDataType, GridDesc_M_N, - GridDesc_M_N, - GridDesc_M_N, ElementwiseFunctor, + MThreadPerBlock, + NThreadPerBlock, MThreadTileSize, - NThreadTileSize>; + NThreadTileSize, + AThreadTransferSrcVectorDim, + AThreadTransferSrcScalarPerVector, + BThreadTransferSrcVectorDim, + BThreadTransferSrcScalarPerVector, + CThreadTransferSrcScalarPerVector>; struct Argument : public BaseArgument { @@ -88,18 +101,12 @@ struct DeviceElementwise_2D : public DeviceElementwise float Run(const Argument& arg, int nrepeat = 1) { - const auto kernel = kernel_elementwise_2d; - // TODO - (void)arg; - (void)nrepeat; - (void)kernel; float avgTime = 0; const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m_n_); if(nrepeat == 0) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp index d8d55ee56fc..97a6d0ab261 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -10,16 +10,14 @@ template __global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global, const BDataType* __restrict__ p_b_global, CDataType* __restrict__ p_c_global, - const AGridDesc_M_N a_grid_desc_m_k, - const BGridDesc_M_N b_grid_desc_m_k, - const CGridDesc_M_N c_grid_desc_m_k, + const GridDesc_M_N a_grid_desc_m_k, + const GridDesc_M_N b_grid_desc_m_k, + const GridDesc_M_N c_grid_desc_m_k, const ElementwiseFunctor functor) { GridwiseEltwise::Run(p_a_global, @@ -34,26 +32,58 @@ __global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global, template + index_t NThreadTileSize, + index_t AThreadTransferSrcVectorDim, + index_t AThreadTransferSrcScalarPerVector, + index_t BThreadTransferSrcVectorDim, + index_t BThreadTransferSrcScalarPerVector, + index_t CThreadTransferSrcScalarPerVector> struct GridwiseElementwise_2D { + static constexpr auto thread_buf_desc_M_N = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + using ThreadBufDesc_M_N = decltype(thread_buf_desc_M_N); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize; + static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize; + + static __device__ __host__ auto CalculateElementwiseIndex(const GridDesc_M_N& grid_desc_m_n) + { + const index_t thread_id = get_thread_local_1d_id(); + const index_t block_id = get_block_1d_id(); + + const index_t M = grid_desc_m_n.GetLength(I0); + const index_t gridSize_m = M / M_BlockTileSize; + const index_t block_2d_idx_m = block_id % gridSize_m; + const index_t block_2d_idx_n = block_id / gridSize_m; + + constexpr auto thread_desc = + make_cluster_descriptor(Sequence{}, Sequence<1, 0>{}); + + const auto thread_2d_idx = thread_desc.CalculateBottomIndex(make_multi_index(thread_id)); + + return make_multi_index( + block_2d_idx_m * M_BlockTileSize + thread_2d_idx[I0] * MThreadTileSize, + block_2d_idx_n * N_BlockTileSize + thread_2d_idx[I1] * NThreadTileSize); + } + __device__ static void Run(const ADataType* __restrict__ p_a_global, const BDataType* __restrict__ p_b_global, CDataType* __restrict__ p_c_global, - const AGridDesc_M_N a_grid_desc_m_n, - const BGridDesc_M_N b_grid_desc_m_n, - const CGridDesc_M_N c_grid_desc_m_n, + const GridDesc_M_N a_grid_desc_m_n, + const GridDesc_M_N b_grid_desc_m_n, + const GridDesc_M_N c_grid_desc_m_n, const ElementwiseFunctor functor) { - // const index_t thread_id = get_thread_local_1d_id(); - // const index_t block_id = get_block_1d_id(); - // printf("block_id = %d, thread_id = %d \n", block_id, thread_id); - const auto a_global_buf = make_dynamic_buffer( p_a_global, a_grid_desc_m_n.GetElementSpaceSize()); const auto b_global_buf = make_dynamic_buffer( @@ -68,14 +98,53 @@ struct GridwiseElementwise_2D StaticBuffer c_thread_buf; - // TODO - buffer_load, apply functor, buffer_store - (void)a_global_buf; - (void)b_global_buf; + const auto a_global_load_offset = CalculateElementwiseIndex(a_grid_desc_m_n); + const auto b_global_load_offset = CalculateElementwiseIndex(b_grid_desc_m_n); + + auto a_global_load = ThreadwiseTensorSliceTransfer_v2< + ADataType, + ADataType, + GridDesc_M_N, + decltype(thread_buf_desc_M_N), + Sequence, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + AThreadTransferSrcVectorDim, + AThreadTransferSrcScalarPerVector, + 1, // SrcScalarStrideInVector + false>{a_grid_desc_m_n, a_global_load_offset}; + + auto b_global_load = ThreadwiseTensorSliceTransfer_v2< + BDataType, + BDataType, + GridDesc_M_N, + decltype(thread_buf_desc_M_N), + Sequence, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + BThreadTransferSrcVectorDim, + BThreadTransferSrcScalarPerVector, + 1, // SrcScalarStrideInVector + false>{b_grid_desc_m_n, b_global_load_offset}; + + a_global_load.Run( + a_grid_desc_m_n, a_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), a_thread_buf); + + b_global_load.Run( + b_grid_desc_m_n, b_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), b_thread_buf); + + static_for<0, MThreadTileSize, 1>{}([&](auto m) { + static_for<0, NThreadTileSize, 1>{}([&](auto n) { + constexpr auto offset = thread_buf_desc_M_N.CalculateOffset(make_tuple(m, n)); + functor(c_thread_buf(Number{}), + a_thread_buf(Number{}), + b_thread_buf(Number{})); + }); + }); + + // TODO - global write (void)c_global_buf; - (void)a_thread_buf; - (void)b_thread_buf; - (void)c_thread_buf; - (void)functor; + // c_global_write.Run( + // thread_buf_desc_M_N, c_thread_buf, c_grid_desc_m_n, make_tuple(I0, I0), + // c_global_buf); } }; From c8b4ac223637f5f8cb24d4e374ec8ef2a88155fd Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 13 Apr 2022 12:28:32 +0000 Subject: [PATCH 07/28] Add global write --- .../gpu/grid/gridwise_elementwise_2d.hpp | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp index 97a6d0ab261..713411c63a4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -2,6 +2,7 @@ #include "cluster_descriptor.hpp" #include "data_type.hpp" +#include "element_wise_operation.hpp" #include "threadwise_tensor_slice_transfer.hpp" namespace ck { @@ -48,6 +49,7 @@ struct GridwiseElementwise_2D static constexpr auto thread_buf_desc_M_N = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{})); + using PassThrough = tensor_operation::element_wise::PassThrough; using ThreadBufDesc_M_N = decltype(thread_buf_desc_M_N); static constexpr auto I0 = Number<0>{}; @@ -88,7 +90,7 @@ struct GridwiseElementwise_2D p_a_global, a_grid_desc_m_n.GetElementSpaceSize()); const auto b_global_buf = make_dynamic_buffer( p_b_global, b_grid_desc_m_n.GetElementSpaceSize()); - const auto c_global_buf = make_dynamic_buffer( + auto c_global_buf = make_dynamic_buffer( p_c_global, c_grid_desc_m_n.GetElementSpaceSize()); StaticBuffer @@ -141,10 +143,23 @@ struct GridwiseElementwise_2D }); // TODO - global write - (void)c_global_buf; - // c_global_write.Run( - // thread_buf_desc_M_N, c_thread_buf, c_grid_desc_m_n, make_tuple(I0, I0), - // c_global_buf); + const auto c_global_write_offset = CalculateElementwiseIndex(c_grid_desc_m_n); + auto c_global_write = ThreadwiseTensorSliceTransfer_v1r3< + CDataType, + CDataType, + decltype(thread_buf_desc_M_N), + GridDesc_M_N, + PassThrough, + Sequence, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + 1, // DstVectorDim + CThreadTransferSrcScalarPerVector, // DstScalarPerVector + InMemoryDataOperationEnum::Set, // DstInMemOp + 1, // DstScalarStrideInVector + false>{c_grid_desc_m_n, c_global_write_offset, PassThrough{}}; + + c_global_write.Run( + thread_buf_desc_M_N, make_tuple(I0, I0), c_thread_buf, c_grid_desc_m_n, c_global_buf); } }; From f2540aa5b4af1658e28607150ae285ef9736df43 Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 13 Apr 2022 12:56:00 +0000 Subject: [PATCH 08/28] Add exponential --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index dd5c32ade61..eabc03e23e0 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" @@ -116,16 +117,19 @@ using DeviceReduceInstance = 1, 1>; -struct Sub +struct Sub_Exp { __host__ __device__ constexpr void operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const { dst = src1 - src2; + // FIXME - use float16 exponential + float dst_f32 = static_cast(dst); + dst = static_cast(exp(dst_f32)); } }; using DeviceElementwiseInstance = ck::tensor_operation::device:: - DeviceElementwise_2D; + DeviceElementwise_2D; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; @@ -289,25 +293,25 @@ int main(int argc, char* argv[]) reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat); // do broadcast sub - auto broadcastSub = DeviceElementwiseInstance{}; - auto broadcastSub_argument_ptr = - broadcastSub.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(), + auto broadcastSubExp = DeviceElementwiseInstance{}; + auto broadcastSubExp_argument_ptr = + broadcastSubExp.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(), c_m_n_max_device_buf.GetDeviceBuffer(), d_m_n_device_buf.GetDeviceBuffer(), {M, N}, {StrideC, 1}, {0, 1}, {StrideC, 1}, - Sub{}); + Sub_Exp{}); - if(!broadcastSub.IsSupportedArgument(broadcastSub_argument_ptr.get())) + if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get())) { throw std::runtime_error("The runtime parameters seems not supported by the " "DeviceElementwise_2D instance, exiting!"); }; - auto broadcastSub_invoker_ptr = broadcastSub.MakeInvokerPointer(); - broadcastSub_invoker_ptr->Run(broadcastSub_argument_ptr.get(), nrepeat); + auto broadcastSubExp_invoker_ptr = broadcastSubExp.MakeInvokerPointer(); + broadcastSubExp_invoker_ptr->Run(broadcastSubExp_argument_ptr.get(), nrepeat); // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv // TODO = do_verification From 30348daad693624a6056c559085a50fa18cdf090 Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 13 Apr 2022 12:59:36 +0000 Subject: [PATCH 09/28] [What] Refine naming [Why] Prepare to add reduceSum --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index eabc03e23e0..2313c7af6f5 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -85,25 +85,25 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -constexpr int Rank = 2; +constexpr int ReduceRank = 2; constexpr int NumReduceDim = 1; -constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX; +constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX; constexpr ck::NanPropagation NanOpt = ck::NanPropagation::PROPAGATE_NAN; constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true; // constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES; -using ReduceOperation = typename ck::reduce_binary_operator::opType; +using ReduceMaxOp = typename ck::reduce_binary_operator::opType; using InElementwiseOperation = - typename ck::reduce_unary_operator::InElementwiseOperation; + typename ck::reduce_unary_operator::InElementwiseOperation; using AccElementwiseOperation = - typename ck::reduce_unary_operator::AccElementwiseOperation; + typename ck::reduce_unary_operator::AccElementwiseOperation; -using DeviceReduceInstance = +using DeviceReduceMaxInstance = ck::tensor_operation::device::DeviceReduceBlockWise Date: Wed, 13 Apr 2022 19:52:38 +0000 Subject: [PATCH 10/28] Add reduce sum for denominator of softmax --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 146 ++++++++++++------ 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 2313c7af6f5..0e86e55c1b4 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -85,27 +85,53 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -constexpr int ReduceRank = 2; -constexpr int NumReduceDim = 1; +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX; -constexpr ck::NanPropagation NanOpt = ck::NanPropagation::PROPAGATE_NAN; +constexpr ck::ReduceTensorOp ReduceSumId = ck::ReduceTensorOp::ADD; +constexpr ck::NanPropagation NanOpt = ck::NanPropagation::PROPAGATE_NAN; constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true; // constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES; using ReduceMaxOp = typename ck::reduce_binary_operator::opType; -using InElementwiseOperation = +using ReduceSumOp = typename ck::reduce_binary_operator::opType; +using ReduceMaxInElementwiseOperation = typename ck::reduce_unary_operator::InElementwiseOperation; -using AccElementwiseOperation = +using ReduceMaxAccElementwiseOperation = typename ck::reduce_unary_operator::AccElementwiseOperation; +using ReduceSumInElementwiseOperation = + typename ck::reduce_unary_operator::InElementwiseOperation; +using ReduceSumAccElementwiseOperation = + typename ck::reduce_unary_operator::AccElementwiseOperation; using DeviceReduceMaxInstance = ck::tensor_operation::device::DeviceReduceBlockWise; + +using DeviceReduceSumInstance = + ck::tensor_operation::device::DeviceReduceBlockWise(dst); - dst = static_cast(exp(dst_f32)); + dst = static_cast(exp(dst_f32)); } }; @@ -198,22 +225,25 @@ int main(int argc, char* argv[]) Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c_m_n_max(std::vector({static_cast(N)}), - std::vector({1})); - Tensor d_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_n_max(std::vector({static_cast(N)}), + std::vector({1})); + Tensor exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor exp_n_sum(std::vector({static_cast(N)}), + std::vector({1})); - const auto i_inLengths = ck::to_int_vector(c_m_n.mDesc.GetLengths()); - const auto i_inStrides = ck::to_int_vector(c_m_n.mDesc.GetStrides()); - const auto i_outLengths = ck::to_int_vector(c_m_n_max.mDesc.GetLengths()); - const auto i_outStrides = ck::to_int_vector(c_m_n_max.mDesc.GetStrides()); + const auto c_m_n_shape = ck::to_int_vector(c_m_n.mDesc.GetLengths()); + const auto c_m_n_stride = ck::to_int_vector(c_m_n.mDesc.GetStrides()); + const auto reduce_n_shape = ck::to_int_vector(c_n_max.mDesc.GetLengths()); + const auto reduce_n_stride = ck::to_int_vector(c_n_max.mDesc.GetStrides()); - size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_m_n_max.mDesc.GetElementSize(); + size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_n_max.mDesc.GetElementSize(); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n.mDesc << std::endl; - std::cout << "c_m_n_max: " << c_m_n_max.mDesc << std::endl; - std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; + std::cout << "c_n_max: " << c_n_max.mDesc << std::endl; + std::cout << "exp_m_n: " << exp_m_n.mDesc << std::endl; + std::cout << "exp_n_sum: " << exp_n_sum.mDesc << std::endl; switch(init_method) { @@ -230,9 +260,10 @@ int main(int argc, char* argv[]) DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace()); - DeviceMem c_m_n_max_device_buf(sizeof(CDataType) * c_m_n_max.mDesc.GetElementSpace()); - DeviceMem c_m_n_max_indices_dev(0); - DeviceMem d_m_n_device_buf(sizeof(CDataType) * d_m_n.mDesc.GetElementSpace()); + DeviceMem c_n_max_device_buf(sizeof(CDataType) * c_n_max.mDesc.GetElementSpace()); + DeviceMem indices_device_buf(0); + DeviceMem exp_m_n_device_buf(sizeof(CDataType) * exp_m_n.mDesc.GetElementSpace()); + DeviceMem exp_n_sum_device_buf(sizeof(CDataType) * exp_n_sum.mDesc.GetElementSpace()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.mData.data()); @@ -265,23 +296,23 @@ int main(int argc, char* argv[]) // do reduce max auto reduce_max = DeviceReduceMaxInstance{}; - auto wsSizeInBytes = reduce_max.GetWorkspaceSizeInBytes(i_inLengths, reduceDims); - DeviceMem ws_dev(wsSizeInBytes); + auto reduce_max_workaspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims); + DeviceMem reduce_max_workaspace_device_buf(reduce_max_workaspace_size); auto reduce_max_argument_ptr = reduce_max.MakeArgumentPointer( - i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + c_m_n_shape, + c_m_n_stride, + reduce_n_shape, + reduce_n_stride, reduceDims, 1, 0, c_m_n_device_buf.GetDeviceBuffer(), - c_m_n_max_device_buf.GetDeviceBuffer(), - c_m_n_max_indices_dev.GetDeviceBuffer(), - ws_dev.GetDeviceBuffer(), - InElementwiseOperation{static_cast(reduce_total_length)}, - AccElementwiseOperation{static_cast(reduce_total_length)}); + c_n_max_device_buf.GetDeviceBuffer(), + indices_device_buf.GetDeviceBuffer(), + reduce_max_workaspace_device_buf.GetDeviceBuffer(), + ReduceMaxInElementwiseOperation{static_cast(reduce_total_length)}, + ReduceMaxAccElementwiseOperation{static_cast(reduce_total_length)}); if(!reduce_max.IsSupportedArgument(reduce_max_argument_ptr.get())) { @@ -292,17 +323,17 @@ int main(int argc, char* argv[]) auto reduce_max_invoker_ptr = reduce_max.MakeInvokerPointer(); reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat); - // do broadcast sub + // do broadcast sub and exp auto broadcastSubExp = DeviceElementwiseInstance{}; auto broadcastSubExp_argument_ptr = broadcastSubExp.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(), - c_m_n_max_device_buf.GetDeviceBuffer(), - d_m_n_device_buf.GetDeviceBuffer(), - {M, N}, - {StrideC, 1}, - {0, 1}, - {StrideC, 1}, - Sub_Exp{}); + c_n_max_device_buf.GetDeviceBuffer(), + exp_m_n_device_buf.GetDeviceBuffer(), + {M, N}, + {StrideC, 1}, + {0, 1}, + {StrideC, 1}, + Sub_Exp{}); if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get())) { @@ -313,7 +344,36 @@ int main(int argc, char* argv[]) auto broadcastSubExp_invoker_ptr = broadcastSubExp.MakeInvokerPointer(); broadcastSubExp_invoker_ptr->Run(broadcastSubExp_argument_ptr.get(), nrepeat); - // TODO - Need BroadcastSub + exponential + ReduceSum + BroadcastDiv + // do reduce sum - denominator of softmax + auto reduce_sum = DeviceReduceSumInstance{}; + auto reduce_sum_workaspace_size = reduce_sum.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims); + DeviceMem reduce_sum_workaspace_device_buf(reduce_sum_workaspace_size); + + auto reduce_sum_argument_ptr = reduce_sum.MakeArgumentPointer( + c_m_n_shape, + c_m_n_stride, + reduce_n_shape, + reduce_n_stride, + reduceDims, + 1, + 0, + exp_m_n_device_buf.GetDeviceBuffer(), + exp_n_sum_device_buf.GetDeviceBuffer(), + indices_device_buf.GetDeviceBuffer(), + reduce_sum_workaspace_device_buf.GetDeviceBuffer(), + ReduceSumInElementwiseOperation{static_cast(reduce_total_length)}, + ReduceSumAccElementwiseOperation{static_cast(reduce_total_length)}); + + if(!reduce_sum.IsSupportedArgument(reduce_sum_argument_ptr.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"); + }; + + auto reduce_sum_invoker_ptr = reduce_sum.MakeInvokerPointer(); + reduce_sum_invoker_ptr->Run(reduce_sum_argument_ptr.get(), nrepeat); + + // TODO - Need BroadcastDiv // TODO = do_verification (void)do_verification; return 0; From 6a781e517640de05eb63b2f269585ff4c7b87a39 Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 13 Apr 2022 20:04:08 +0000 Subject: [PATCH 11/28] Add broadcast div, the final step of softmax --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 51 ++++++++++++++++--- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 0e86e55c1b4..f11205f6437 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -155,9 +155,21 @@ struct Sub_Exp } }; -using DeviceElementwiseInstance = ck::tensor_operation::device:: +struct Div +{ + __host__ __device__ constexpr void + operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const + { + dst = src1 / src2; + } +}; + +using DeviceElementwiseSubExpInstance = ck::tensor_operation::device:: DeviceElementwise_2D; +using DeviceElementwiseDivInstance = ck::tensor_operation::device:: + DeviceElementwise_2D; + using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; @@ -230,10 +242,11 @@ int main(int argc, char* argv[]) Tensor exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor exp_n_sum(std::vector({static_cast(N)}), std::vector({1})); + Tensor softmax_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - const auto c_m_n_shape = ck::to_int_vector(c_m_n.mDesc.GetLengths()); - const auto c_m_n_stride = ck::to_int_vector(c_m_n.mDesc.GetStrides()); - const auto reduce_n_shape = ck::to_int_vector(c_n_max.mDesc.GetLengths()); + const auto c_m_n_shape = ck::to_int_vector(c_m_n.mDesc.GetLengths()); + const auto c_m_n_stride = ck::to_int_vector(c_m_n.mDesc.GetStrides()); + const auto reduce_n_shape = ck::to_int_vector(c_n_max.mDesc.GetLengths()); const auto reduce_n_stride = ck::to_int_vector(c_n_max.mDesc.GetStrides()); size_t reduce_total_length = c_m_n.mDesc.GetElementSize() / c_n_max.mDesc.GetElementSize(); @@ -244,6 +257,7 @@ int main(int argc, char* argv[]) std::cout << "c_n_max: " << c_n_max.mDesc << std::endl; std::cout << "exp_m_n: " << exp_m_n.mDesc << std::endl; std::cout << "exp_n_sum: " << exp_n_sum.mDesc << std::endl; + std::cout << "softmax_m_n: " << softmax_m_n.mDesc << std::endl; switch(init_method) { @@ -264,6 +278,7 @@ int main(int argc, char* argv[]) DeviceMem indices_device_buf(0); DeviceMem exp_m_n_device_buf(sizeof(CDataType) * exp_m_n.mDesc.GetElementSpace()); DeviceMem exp_n_sum_device_buf(sizeof(CDataType) * exp_n_sum.mDesc.GetElementSpace()); + DeviceMem softmax_m_n_device_buf(sizeof(CDataType) * softmax_m_n.mDesc.GetElementSpace()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.mData.data()); @@ -295,7 +310,7 @@ int main(int argc, char* argv[]) gemm_invoker.Run(gemm_argument, nrepeat); // do reduce max - auto reduce_max = DeviceReduceMaxInstance{}; + auto reduce_max = DeviceReduceMaxInstance{}; auto reduce_max_workaspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims); DeviceMem reduce_max_workaspace_device_buf(reduce_max_workaspace_size); @@ -324,7 +339,7 @@ int main(int argc, char* argv[]) reduce_max_invoker_ptr->Run(reduce_max_argument_ptr.get(), nrepeat); // do broadcast sub and exp - auto broadcastSubExp = DeviceElementwiseInstance{}; + auto broadcastSubExp = DeviceElementwiseSubExpInstance{}; auto broadcastSubExp_argument_ptr = broadcastSubExp.MakeArgumentPointer(c_m_n_device_buf.GetDeviceBuffer(), c_n_max_device_buf.GetDeviceBuffer(), @@ -345,7 +360,7 @@ int main(int argc, char* argv[]) broadcastSubExp_invoker_ptr->Run(broadcastSubExp_argument_ptr.get(), nrepeat); // do reduce sum - denominator of softmax - auto reduce_sum = DeviceReduceSumInstance{}; + auto reduce_sum = DeviceReduceSumInstance{}; auto reduce_sum_workaspace_size = reduce_sum.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims); DeviceMem reduce_sum_workaspace_device_buf(reduce_sum_workaspace_size); @@ -373,7 +388,27 @@ int main(int argc, char* argv[]) auto reduce_sum_invoker_ptr = reduce_sum.MakeInvokerPointer(); reduce_sum_invoker_ptr->Run(reduce_sum_argument_ptr.get(), nrepeat); - // TODO - Need BroadcastDiv + // do broadcast div + auto broadcastDiv = DeviceElementwiseDivInstance{}; + auto broadcastDiv_argument_ptr = + broadcastDiv.MakeArgumentPointer(exp_m_n_device_buf.GetDeviceBuffer(), + exp_n_sum_device_buf.GetDeviceBuffer(), + softmax_m_n_device_buf.GetDeviceBuffer(), + {M, N}, + {StrideC, 1}, + {0, 1}, + {StrideC, 1}, + Div{}); + + if(!broadcastDiv.IsSupportedArgument(broadcastDiv_argument_ptr.get())) + { + throw std::runtime_error("The runtime parameters seems not supported by the " + "DeviceElementwise_2D instance, exiting!"); + }; + + auto broadcastDiv_invoker_ptr = broadcastDiv.MakeInvokerPointer(); + broadcastDiv_invoker_ptr->Run(broadcastDiv_argument_ptr.get(), nrepeat); + // TODO = do_verification (void)do_verification; return 0; From dba65b1c71197d63688c75fb0290142b7a0f30e4 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 14 Apr 2022 19:18:34 +0000 Subject: [PATCH 12/28] Rewrite the gridwise_elementwise_ 2d as 1d version --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 4 +- .../gpu/device/device_elementwise_2d.hpp | 111 ++++++------ .../gpu/grid/gridwise_elementwise_1d.hpp | 149 ++++++++++++++++ .../gpu/grid/gridwise_elementwise_2d.hpp | 166 ------------------ 4 files changed, 200 insertions(+), 230 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index f11205f6437..9f2cbe1e625 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -165,10 +165,10 @@ struct Div }; using DeviceElementwiseSubExpInstance = ck::tensor_operation::device:: - DeviceElementwise_2D; + DeviceElementwise_2D; using DeviceElementwiseDivInstance = ck::tensor_operation::device:: - DeviceElementwise_2D; + DeviceElementwise_2D; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp index db9974b4a00..182929f63cd 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -4,7 +4,7 @@ #include "device.hpp" #include "device_elementwise.hpp" -#include "gridwise_elementwise_2d.hpp" +#include "gridwise_elementwise_1d.hpp" namespace ck { namespace tensor_operation { @@ -14,48 +14,40 @@ template + index_t ThreadPerBlock, + index_t ThreadTileSize, + index_t ScalarPerVector> struct DeviceElementwise_2D : public DeviceElementwise { - static_assert(NThreadTileSize % AThreadTransferSrcScalarPerVector == 0 && - NThreadTileSize % BThreadTransferSrcScalarPerVector == 0); + static_assert(ThreadTileSize % ScalarPerVector == 0); + static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize; + static constexpr auto I0 = Number<0>{}; - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static auto Make2dDescriptor_M_N(const std::vector& shape, const std::vector& stride) + static auto MakeDescriptor_M0(const std::vector& shape, const std::vector& stride) { - return make_naive_tensor_descriptor(make_tuple(shape[0], shape[1]), - make_tuple(stride[0], stride[1])); + const int m = shape[0]; + const int n = shape[1]; + + // 2d desc - [m, n] + const auto desc_m_n = + make_naive_tensor_descriptor(make_tuple(m, n), make_tuple(stride[0], stride[1])); + + // 1d desc - [m * n] + return transform_tensor_descriptor(desc_m_n, + make_tuple(make_merge_transform(make_tuple(m, n))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); } - static constexpr index_t BlockSize = MThreadPerBlock * NThreadPerBlock; - static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize; - static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize; - - using GridDesc_M_N = decltype(Make2dDescriptor_M_N({1, 1}, {1, 1})); - using GridwiseEltwise = GridwiseElementwise_2D; + ThreadPerBlock, + ThreadTileSize, + ScalarPerVector>; struct Argument : public BaseArgument { @@ -70,9 +62,9 @@ struct DeviceElementwise_2D : public DeviceElementwise : p_a_(p_a), p_b_(p_b), p_c_(p_c), - a_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_a)), - b_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_b)), - c_grid_desc_m_n_(Make2dDescriptor_M_N(shape, stride_c)), + a_grid_desc_m0_(MakeDescriptor_M0(shape, stride_a)), + b_grid_desc_m0_(MakeDescriptor_M0(shape, stride_b)), + c_grid_desc_m0_(MakeDescriptor_M0(shape, stride_c)), functor_(functor) { } @@ -80,47 +72,42 @@ struct DeviceElementwise_2D : public DeviceElementwise const ADataType* p_a_; const BDataType* p_b_; CDataType* p_c_; - GridDesc_M_N a_grid_desc_m_n_; - GridDesc_M_N b_grid_desc_m_n_; - GridDesc_M_N c_grid_desc_m_n_; + GridDesc_M0 a_grid_desc_m0_; + GridDesc_M0 b_grid_desc_m0_; + GridDesc_M0 c_grid_desc_m0_; ElementwiseFunctor functor_; }; struct Invoker : public BaseInvoker { - index_t CalculateGridSize(const GridDesc_M_N& grid_desc_m_n) + index_t CalculateGridSize(const GridDesc_M0& grid_desc_m0) { - const auto M = grid_desc_m_n.GetLength(I0); - const auto N = grid_desc_m_n.GetLength(I1); - - assert(M % M_BlockTileSize == 0); - assert(N % N_BlockTileSize == 0); - - return (M / M_BlockTileSize) * (N / N_BlockTileSize); + const auto gridTileSize = grid_desc_m0.GetLength(I0); + return gridTileSize / BlockTileSize; } float Run(const Argument& arg, int nrepeat = 1) { - const auto kernel = kernel_elementwise_2d; float avgTime = 0; - const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m_n_); + const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m0_); if(nrepeat == 0) { launch_kernel(kernel, dim3(gridSize), - dim3(BlockSize), + dim3(ThreadPerBlock), 0, arg.p_a_, arg.p_b_, arg.p_c_, - arg.a_grid_desc_m_n_, - arg.b_grid_desc_m_n_, - arg.c_grid_desc_m_n_, + arg.a_grid_desc_m0_, + arg.b_grid_desc_m0_, + arg.c_grid_desc_m0_, arg.functor_); } else @@ -128,14 +115,14 @@ struct DeviceElementwise_2D : public DeviceElementwise avgTime = launch_and_time_kernel(kernel, nrepeat, dim3(gridSize), - dim3(BlockSize), + dim3(ThreadPerBlock), 0, arg.p_a_, arg.p_b_, arg.p_c_, - arg.a_grid_desc_m_n_, - arg.b_grid_desc_m_n_, - arg.c_grid_desc_m_n_, + arg.a_grid_desc_m0_, + arg.b_grid_desc_m0_, + arg.c_grid_desc_m0_, arg.functor_); } return avgTime; @@ -154,10 +141,10 @@ struct DeviceElementwise_2D : public DeviceElementwise if(pArg == nullptr) return false; - const auto M = pArg->c_grid_desc_m_n_.GetLength(I0); - const auto N = pArg->c_grid_desc_m_n_.GetLength(I1); + // m * n + const auto m0 = pArg->c_grid_desc_m0_.GetLength(I0); - if(M % M_BlockTileSize != 0 && N % N_BlockTileSize != 0) + if(m0 % BlockTileSize != 0) return false; return true; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp new file mode 100644 index 00000000000..cdd86e0e950 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp @@ -0,0 +1,149 @@ +#pragma once + +#include "cluster_descriptor.hpp" +#include "data_type.hpp" +#include "element_wise_operation.hpp" +#include "threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void kernel_elementwise_1d(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const GridDesc_M0 a_grid_desc_m0, + const GridDesc_M0 b_grid_desc_m0, + const GridDesc_M0 c_grid_desc_m0, + const ElementwiseFunctor functor) +{ + GridwiseEltwise::Run(p_a_global, + p_b_global, + p_c_global, + a_grid_desc_m0, + b_grid_desc_m0, + c_grid_desc_m0, + functor); +} + +template +struct GridwiseElementwise_1D +{ + static constexpr auto I0 = Number<0>{}; + static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize; + static constexpr auto thread_desc_M0 = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + using PassThrough = tensor_operation::element_wise::PassThrough; + + static __device__ __host__ auto CalculateElementwiseIndex() + { + const index_t thread_id = get_thread_local_1d_id(); + const index_t block_id = get_block_1d_id(); + + return make_multi_index(block_id * BlockTileSize + thread_id * ScalarPerVector); + } + + __device__ static void Run(const ADataType* __restrict__ p_a_global, + const BDataType* __restrict__ p_b_global, + CDataType* __restrict__ p_c_global, + const GridDesc_M0 a_grid_desc_m0, + const GridDesc_M0 b_grid_desc_m0, + const GridDesc_M0 c_grid_desc_m0, + const ElementwiseFunctor functor) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_grid_desc_m0.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_grid_desc_m0.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_grid_desc_m0.GetElementSpaceSize()); + + StaticBuffer a_thread_buf; + StaticBuffer b_thread_buf; + StaticBuffer c_thread_buf; + + const auto thread_to_global_offset = CalculateElementwiseIndex(); + + auto a_global_load = + ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + ScalarPerVector, + 1, // SrcScalarStrideInVector + false>{a_grid_desc_m0, thread_to_global_offset}; + + auto b_global_load = + ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + ScalarPerVector, + 1, // SrcScalarStrideInVector + false>{b_grid_desc_m0, thread_to_global_offset}; + + auto c_global_write = + ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // DstVectorDim + ScalarPerVector, + InMemoryDataOperationEnum::Set, + 1, // DstScalarStrideInVector + false>{ + c_grid_desc_m0, thread_to_global_offset, PassThrough{}}; + + int num_iter = ThreadTileSize / ScalarPerVector; + constexpr auto thread_to_global_step = make_multi_index(ThreadPerBlock * ScalarPerVector); + do + { + // read and process ScalarPerVector elements + a_global_load.Run( + a_grid_desc_m0, a_global_buf, thread_desc_M0, make_tuple(I0), a_thread_buf); + + b_global_load.Run( + b_grid_desc_m0, b_global_buf, thread_desc_M0, make_tuple(I0), b_thread_buf); + + static_for<0, ScalarPerVector, 1>{}([&](auto m) { + constexpr auto offset = thread_desc_M0.CalculateOffset(make_tuple(m)); + functor(c_thread_buf(Number{}), + a_thread_buf(Number{}), + b_thread_buf(Number{})); + }); + + c_global_write.Run(thread_desc_M0, + make_tuple(I0), // SrcSliceOriginIdx + c_thread_buf, + c_grid_desc_m0, + c_global_buf); + + a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, thread_to_global_step); + b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, thread_to_global_step); + c_global_write.MoveDstSliceWindow(c_grid_desc_m0, thread_to_global_step); + } while(--num_iter); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp deleted file mode 100644 index 713411c63a4..00000000000 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp +++ /dev/null @@ -1,166 +0,0 @@ -#pragma once - -#include "cluster_descriptor.hpp" -#include "data_type.hpp" -#include "element_wise_operation.hpp" -#include "threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -__global__ void kernel_elementwise_2d(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - CDataType* __restrict__ p_c_global, - const GridDesc_M_N a_grid_desc_m_k, - const GridDesc_M_N b_grid_desc_m_k, - const GridDesc_M_N c_grid_desc_m_k, - const ElementwiseFunctor functor) -{ - GridwiseEltwise::Run(p_a_global, - p_b_global, - p_c_global, - a_grid_desc_m_k, - b_grid_desc_m_k, - c_grid_desc_m_k, - functor); -} - -template -struct GridwiseElementwise_2D -{ - static constexpr auto thread_buf_desc_M_N = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - using PassThrough = tensor_operation::element_wise::PassThrough; - using ThreadBufDesc_M_N = decltype(thread_buf_desc_M_N); - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr int M_BlockTileSize = MThreadPerBlock * MThreadTileSize; - static constexpr int N_BlockTileSize = NThreadPerBlock * NThreadTileSize; - - static __device__ __host__ auto CalculateElementwiseIndex(const GridDesc_M_N& grid_desc_m_n) - { - const index_t thread_id = get_thread_local_1d_id(); - const index_t block_id = get_block_1d_id(); - - const index_t M = grid_desc_m_n.GetLength(I0); - const index_t gridSize_m = M / M_BlockTileSize; - const index_t block_2d_idx_m = block_id % gridSize_m; - const index_t block_2d_idx_n = block_id / gridSize_m; - - constexpr auto thread_desc = - make_cluster_descriptor(Sequence{}, Sequence<1, 0>{}); - - const auto thread_2d_idx = thread_desc.CalculateBottomIndex(make_multi_index(thread_id)); - - return make_multi_index( - block_2d_idx_m * M_BlockTileSize + thread_2d_idx[I0] * MThreadTileSize, - block_2d_idx_n * N_BlockTileSize + thread_2d_idx[I1] * NThreadTileSize); - } - - __device__ static void Run(const ADataType* __restrict__ p_a_global, - const BDataType* __restrict__ p_b_global, - CDataType* __restrict__ p_c_global, - const GridDesc_M_N a_grid_desc_m_n, - const GridDesc_M_N b_grid_desc_m_n, - const GridDesc_M_N c_grid_desc_m_n, - const ElementwiseFunctor functor) - { - const auto a_global_buf = make_dynamic_buffer( - p_a_global, a_grid_desc_m_n.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( - p_b_global, b_grid_desc_m_n.GetElementSpaceSize()); - auto c_global_buf = make_dynamic_buffer( - p_c_global, c_grid_desc_m_n.GetElementSpaceSize()); - - StaticBuffer - a_thread_buf; - StaticBuffer - b_thread_buf; - StaticBuffer - c_thread_buf; - - const auto a_global_load_offset = CalculateElementwiseIndex(a_grid_desc_m_n); - const auto b_global_load_offset = CalculateElementwiseIndex(b_grid_desc_m_n); - - auto a_global_load = ThreadwiseTensorSliceTransfer_v2< - ADataType, - ADataType, - GridDesc_M_N, - decltype(thread_buf_desc_M_N), - Sequence, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - AThreadTransferSrcVectorDim, - AThreadTransferSrcScalarPerVector, - 1, // SrcScalarStrideInVector - false>{a_grid_desc_m_n, a_global_load_offset}; - - auto b_global_load = ThreadwiseTensorSliceTransfer_v2< - BDataType, - BDataType, - GridDesc_M_N, - decltype(thread_buf_desc_M_N), - Sequence, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - BThreadTransferSrcVectorDim, - BThreadTransferSrcScalarPerVector, - 1, // SrcScalarStrideInVector - false>{b_grid_desc_m_n, b_global_load_offset}; - - a_global_load.Run( - a_grid_desc_m_n, a_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), a_thread_buf); - - b_global_load.Run( - b_grid_desc_m_n, b_global_buf, thread_buf_desc_M_N, make_tuple(I0, I0), b_thread_buf); - - static_for<0, MThreadTileSize, 1>{}([&](auto m) { - static_for<0, NThreadTileSize, 1>{}([&](auto n) { - constexpr auto offset = thread_buf_desc_M_N.CalculateOffset(make_tuple(m, n)); - functor(c_thread_buf(Number{}), - a_thread_buf(Number{}), - b_thread_buf(Number{})); - }); - }); - - // TODO - global write - const auto c_global_write_offset = CalculateElementwiseIndex(c_grid_desc_m_n); - auto c_global_write = ThreadwiseTensorSliceTransfer_v1r3< - CDataType, - CDataType, - decltype(thread_buf_desc_M_N), - GridDesc_M_N, - PassThrough, - Sequence, // SliceLengths - Sequence<0, 1>, // DimAccessOrder - 1, // DstVectorDim - CThreadTransferSrcScalarPerVector, // DstScalarPerVector - InMemoryDataOperationEnum::Set, // DstInMemOp - 1, // DstScalarStrideInVector - false>{c_grid_desc_m_n, c_global_write_offset, PassThrough{}}; - - c_global_write.Run( - thread_buf_desc_M_N, make_tuple(I0, I0), c_thread_buf, c_grid_desc_m_n, c_global_buf); - } -}; - -} // namespace ck From fe6595020b0ce31f7987956119e2427079d1b336 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 15 Apr 2022 12:29:04 +0000 Subject: [PATCH 13/28] Add verication of softmax --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 122 ++++++++++++++++-- 1 file changed, 110 insertions(+), 12 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 9f2cbe1e625..922c914f928 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -5,11 +5,14 @@ #include #include #include +#include "check_err.hpp" #include "config.hpp" #include "device.hpp" #include "host_tensor.hpp" #include "host_tensor_generator.hpp" #include "host_reduce_util.hpp" +#include "host_reduction.hpp" + #include "device_tensor.hpp" #include "device_gemm_xdl.hpp" #include "device_gemm_xdl_c_shuffle.hpp" @@ -89,9 +92,7 @@ constexpr int Rank = 2; constexpr int NumReduceDim = 1; constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX; constexpr ck::ReduceTensorOp ReduceSumId = ck::ReduceTensorOp::ADD; -constexpr ck::NanPropagation NanOpt = ck::NanPropagation::PROPAGATE_NAN; -constexpr bool PropagateNan = (NanOpt == ck::NanPropagation::NOT_PROPAGATE_NAN) ? false : true; -// constexpr ck::ReduceTensorIndices_t IndicesOpt = ck::ReduceTensorIndices_t::NO_INDICES; +constexpr bool ReducePropagateNan = false; using ReduceMaxOp = typename ck::reduce_binary_operator::opType; using ReduceSumOp = typename ck::reduce_binary_operator::opType; using ReduceMaxInElementwiseOperation = @@ -112,7 +113,7 @@ using DeviceReduceMaxInstance = ReduceMaxOp, ReduceMaxInElementwiseOperation, ReduceMaxAccElementwiseOperation, - PropagateNan, + ReducePropagateNan, false, 256, 4, @@ -132,7 +133,7 @@ using DeviceReduceSumInstance = ReduceSumOp, ReduceSumInElementwiseOperation, ReduceSumAccElementwiseOperation, - PropagateNan, + ReducePropagateNan, false, 256, 4, @@ -170,9 +171,47 @@ using DeviceElementwiseSubExpInstance = ck::tensor_operation::device:: using DeviceElementwiseDivInstance = ck::tensor_operation::device:: DeviceElementwise_2D; -using ReferenceGemmInstance = ck::tensor_operation::host:: +using HostGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using HostReduceMaxInstance = ReductionHost; + +using HostReduceSumInstance = ReductionHost; + +template +void host_broadcast2D( + HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, int N, Functor functor) +{ + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + if constexpr(broadcastDim == 1) + functor(C(m, n), A(m, n), B(n)); + else + functor(C(m, n), A(m, n), B(m)); + } + } +} + int main(int argc, char* argv[]) { bool do_verification = 0; @@ -189,7 +228,6 @@ int main(int argc, char* argv[]) ck::index_t StrideC = 4096; const std::vector reduceDims{0}; - const std::vector reduceInvariantDims{1}; if(argc == 4) { @@ -237,8 +275,8 @@ int main(int argc, char* argv[]) Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c_n_max(std::vector({static_cast(N)}), - std::vector({1})); + Tensor c_n_max(std::vector({static_cast(N)}), + std::vector({1})); Tensor exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor exp_n_sum(std::vector({static_cast(N)}), std::vector({1})); @@ -370,8 +408,8 @@ int main(int argc, char* argv[]) reduce_n_shape, reduce_n_stride, reduceDims, - 1, - 0, + 1, // alpha + 0, // beta exp_m_n_device_buf.GetDeviceBuffer(), exp_n_sum_device_buf.GetDeviceBuffer(), indices_device_buf.GetDeviceBuffer(), @@ -410,6 +448,66 @@ int main(int argc, char* argv[]) broadcastDiv_invoker_ptr->Run(broadcastDiv_argument_ptr.get(), nrepeat); // TODO = do_verification - (void)do_verification; + if(do_verification) + { + std::cout << "verification..." << std::endl; + const std::vector reduceInvariantDims{1}; + Tensor host_c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor host_c_n_max(std::vector({static_cast(N)}), + std::vector({1})); + Tensor host_indices(host_c_n_max.mDesc.GetLengths()); + Tensor host_exp_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor host_exp_n_sum(std::vector({static_cast(N)}), + std::vector({1})); + Tensor host_softmax_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + auto host_gemm = HostGemmInstance{}; + auto host_gemm_invoker = host_gemm.MakeInvoker(); + auto host_gemm_argument = host_gemm.MakeArgument( + a_m_k, b_k_n, host_c_m_n, PassThrough{}, PassThrough{}, PassThrough{}); + + auto host_reduce_max = HostReduceMaxInstance{ + host_c_m_n.mDesc, host_c_n_max.mDesc, reduceInvariantDims, reduceDims}; + + auto host_reduce_sum = HostReduceSumInstance{ + host_exp_m_n.mDesc, host_exp_n_sum.mDesc, reduceInvariantDims, reduceDims}; + + host_gemm_invoker.Run(host_gemm_argument); + host_reduce_max.Run(1, // alpha + reinterpret_cast(host_c_m_n.mData.data()), + 0, // beta + reinterpret_cast(host_c_n_max.mData.data()), + host_indices.mData.data()); + + host_broadcast2D, Tensor, Tensor, Sub_Exp, 1>( + host_exp_m_n, host_c_m_n, host_c_n_max, M, N, Sub_Exp{}); + + host_reduce_sum.Run(1, // alpha + reinterpret_cast(host_exp_m_n.mData.data()), + 0, // beta + reinterpret_cast(host_exp_n_sum.mData.data()), + host_indices.mData.data()); + + host_broadcast2D, Tensor, Tensor, Div, 1>( + host_softmax_m_n, host_exp_m_n, host_exp_n_sum, M, N, Div{}); + + c_m_n_device_buf.FromDevice(c_m_n.mData.data()); + c_n_max_device_buf.FromDevice(c_n_max.mData.data()); + exp_m_n_device_buf.FromDevice(exp_m_n.mData.data()); + exp_n_sum_device_buf.FromDevice(exp_n_sum.mData.data()); + softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data()); + + bool result = true; + if (result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData)) + std::cout << "[PASS] - c_m_n" << std::endl; + if (result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData)) + std::cout << "[PASS] - c_n_max" << std::endl; + if (result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData)) + std::cout << "[PASS] - exp_m_n" << std::endl; + if (result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData)) + std::cout << "[PASS] - exp_n_sum" << std::endl; + if (result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData)) + std::cout << "[PASS] - softmax_m_n" << std::endl; + } return 0; } From e83b22e052e386f553ce2ddbedd66deb0f7ee167 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 18 Apr 2022 10:44:29 +0000 Subject: [PATCH 14/28] [What] Use half_float::half instead of ck::half_t for host reduction [Why] std::numeric_limits<_Float16>::lowest() will return zero --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 922c914f928..3b164b578c9 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -41,6 +41,10 @@ using BDataType = F16; using CDataType = F16; using AccDataType = F32; +// CAUSION - host reduce_max will call numeric_limits::lowest() +// However, numeric_limits::lowest() will return zero. So, used half_float::half instead +using HostReduceDataType = half_float::half; + using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; @@ -174,18 +178,18 @@ using DeviceElementwiseDivInstance = ck::tensor_operation::device:: using HostGemmInstance = ck::tensor_operation::host:: ReferenceGemm; -using HostReduceMaxInstance = ReductionHost; -using HostReduceSumInstance = ReductionHost(host_c_m_n.mData.data()), + reinterpret_cast(host_c_m_n.mData.data()), 0, // beta - reinterpret_cast(host_c_n_max.mData.data()), + reinterpret_cast(host_c_n_max.mData.data()), host_indices.mData.data()); host_broadcast2D, Tensor, Tensor, Sub_Exp, 1>( host_exp_m_n, host_c_m_n, host_c_n_max, M, N, Sub_Exp{}); host_reduce_sum.Run(1, // alpha - reinterpret_cast(host_exp_m_n.mData.data()), + reinterpret_cast(host_exp_m_n.mData.data()), 0, // beta - reinterpret_cast(host_exp_n_sum.mData.data()), + reinterpret_cast(host_exp_n_sum.mData.data()), host_indices.mData.data()); host_broadcast2D, Tensor, Tensor, Div, 1>( @@ -498,15 +502,15 @@ int main(int argc, char* argv[]) softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data()); bool result = true; - if (result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData)) + if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData)) std::cout << "[PASS] - c_m_n" << std::endl; - if (result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData)) + if(result &= ck::utils::check_err(c_n_max.mData, host_c_n_max.mData)) std::cout << "[PASS] - c_n_max" << std::endl; - if (result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData)) + if(result &= ck::utils::check_err(exp_m_n.mData, host_exp_m_n.mData)) std::cout << "[PASS] - exp_m_n" << std::endl; - if (result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData)) + if(result &= ck::utils::check_err(exp_n_sum.mData, host_exp_n_sum.mData)) std::cout << "[PASS] - exp_n_sum" << std::endl; - if (result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData)) + if(result &= ck::utils::check_err(softmax_m_n.mData, host_softmax_m_n.mData)) std::cout << "[PASS] - softmax_m_n" << std::endl; } return 0; From 21802fda18e7e659418642c5befd163932968896 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 18 Apr 2022 11:09:17 +0000 Subject: [PATCH 15/28] [What] Sync input of each host kernel and device kernel [Why] Prevent error propogation --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 3b164b578c9..2326168e1e7 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -455,6 +455,13 @@ int main(int argc, char* argv[]) if(do_verification) { std::cout << "verification..." << std::endl; + + c_m_n_device_buf.FromDevice(c_m_n.mData.data()); + c_n_max_device_buf.FromDevice(c_n_max.mData.data()); + exp_m_n_device_buf.FromDevice(exp_m_n.mData.data()); + exp_n_sum_device_buf.FromDevice(exp_n_sum.mData.data()); + softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data()); + const std::vector reduceInvariantDims{1}; Tensor host_c_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor host_c_n_max(std::vector({static_cast(N)}), @@ -478,28 +485,22 @@ int main(int argc, char* argv[]) host_gemm_invoker.Run(host_gemm_argument); host_reduce_max.Run(1, // alpha - reinterpret_cast(host_c_m_n.mData.data()), + reinterpret_cast(c_m_n.mData.data()), 0, // beta reinterpret_cast(host_c_n_max.mData.data()), host_indices.mData.data()); host_broadcast2D, Tensor, Tensor, Sub_Exp, 1>( - host_exp_m_n, host_c_m_n, host_c_n_max, M, N, Sub_Exp{}); + host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{}); host_reduce_sum.Run(1, // alpha - reinterpret_cast(host_exp_m_n.mData.data()), + reinterpret_cast(exp_m_n.mData.data()), 0, // beta reinterpret_cast(host_exp_n_sum.mData.data()), host_indices.mData.data()); host_broadcast2D, Tensor, Tensor, Div, 1>( - host_softmax_m_n, host_exp_m_n, host_exp_n_sum, M, N, Div{}); - - c_m_n_device_buf.FromDevice(c_m_n.mData.data()); - c_n_max_device_buf.FromDevice(c_n_max.mData.data()); - exp_m_n_device_buf.FromDevice(exp_m_n.mData.data()); - exp_n_sum_device_buf.FromDevice(exp_n_sum.mData.data()); - softmax_m_n_device_buf.FromDevice(softmax_m_n.mData.data()); + host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{}); bool result = true; if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData)) From cf3266902d26e828d47b367838d45790a9014b22 Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 20 Apr 2022 10:32:39 +0000 Subject: [PATCH 16/28] [What] Use F32 as the acc of reduce sum [Why] Prevent loss of precision --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 2326168e1e7..73556331b56 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -98,15 +98,15 @@ constexpr ck::ReduceTensorOp ReduceMaxId = ck::ReduceTensorOp::MAX; constexpr ck::ReduceTensorOp ReduceSumId = ck::ReduceTensorOp::ADD; constexpr bool ReducePropagateNan = false; using ReduceMaxOp = typename ck::reduce_binary_operator::opType; -using ReduceSumOp = typename ck::reduce_binary_operator::opType; +using ReduceSumOp = typename ck::reduce_binary_operator::opType; using ReduceMaxInElementwiseOperation = typename ck::reduce_unary_operator::InElementwiseOperation; using ReduceMaxAccElementwiseOperation = typename ck::reduce_unary_operator::AccElementwiseOperation; using ReduceSumInElementwiseOperation = - typename ck::reduce_unary_operator::InElementwiseOperation; + typename ck::reduce_unary_operator::InElementwiseOperation; using ReduceSumAccElementwiseOperation = - typename ck::reduce_unary_operator::AccElementwiseOperation; + typename ck::reduce_unary_operator::AccElementwiseOperation; using DeviceReduceMaxInstance = ck::tensor_operation::device::DeviceReduceBlockWise; using HostReduceSumInstance = ReductionHost Date: Wed, 20 Apr 2022 10:57:41 +0000 Subject: [PATCH 17/28] [What] Add ComputeDataType to the eltwise kernel [Why] Similar to acc datatype, it increase precision --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 75 ++++++++++++------- .../gpu/device/device_elementwise_2d.hpp | 2 + .../gpu/grid/gridwise_elementwise_1d.hpp | 13 ++-- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 73556331b56..0accded132e 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -36,10 +36,11 @@ using Col = ck::tensor_layout::gemm::ColumnMajor; using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using ADataType = F16; -using BDataType = F16; -using CDataType = F16; -using AccDataType = F32; +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; +using AccDataType = F32; +using EltwiseComputeDataType = F32; // CAUSION - host reduce_max will call numeric_limits::lowest() // However, numeric_limits::lowest() will return zero. So, used half_float::half instead @@ -103,10 +104,10 @@ using ReduceMaxInElementwiseOperation = typename ck::reduce_unary_operator::InElementwiseOperation; using ReduceMaxAccElementwiseOperation = typename ck::reduce_unary_operator::AccElementwiseOperation; -using ReduceSumInElementwiseOperation = - typename ck::reduce_unary_operator::InElementwiseOperation; -using ReduceSumAccElementwiseOperation = - typename ck::reduce_unary_operator::AccElementwiseOperation; +using ReduceSumInElementwiseOperation = typename ck:: + reduce_unary_operator::InElementwiseOperation; +using ReduceSumAccElementwiseOperation = typename ck:: + reduce_unary_operator::AccElementwiseOperation; using DeviceReduceMaxInstance = ck::tensor_operation::device::DeviceReduceBlockWise(dst); - dst = static_cast(exp(dst_f32)); + dst = exp(src1 - src2); } }; struct Div { - __host__ __device__ constexpr void - operator()(CDataType& dst, const CDataType& src1, const CDataType& src2) const + __host__ __device__ constexpr void operator()(EltwiseComputeDataType& dst, + const EltwiseComputeDataType& src1, + const EltwiseComputeDataType& src2) const { dst = src1 / src2; } }; -using DeviceElementwiseSubExpInstance = ck::tensor_operation::device:: - DeviceElementwise_2D; +using DeviceElementwiseSubExpInstance = + ck::tensor_operation::device::DeviceElementwise_2D; using DeviceElementwiseDivInstance = ck::tensor_operation::device:: - DeviceElementwise_2D; + DeviceElementwise_2D; using HostGemmInstance = ck::tensor_operation::host:: ReferenceGemm; @@ -199,6 +206,7 @@ using HostReduceSumInstance = ReductionHost void host_broadcast2D( @@ -208,10 +216,19 @@ void host_broadcast2D( { for(int n = 0; n < N; ++n) { + ComputeDataType Amn = static_cast(A(m, n)); + ComputeDataType Cmn = 0; if constexpr(broadcastDim == 1) - functor(C(m, n), A(m, n), B(n)); + { + ComputeDataType Bn = static_cast(B(n)); + functor(Cmn, Amn, Bn); + } else - functor(C(m, n), A(m, n), B(m)); + { + ComputeDataType Bm = static_cast(B(m)); + functor(Cmn, Amn, Bm); + } + C(m, n) = static_cast(Cmn); } } } @@ -490,8 +507,12 @@ int main(int argc, char* argv[]) reinterpret_cast(host_c_n_max.mData.data()), host_indices.mData.data()); - host_broadcast2D, Tensor, Tensor, Sub_Exp, 1>( - host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{}); + host_broadcast2D, + Tensor, + Tensor, + EltwiseComputeDataType, + Sub_Exp, + 1>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{}); host_reduce_sum.Run(1, // alpha reinterpret_cast(exp_m_n.mData.data()), @@ -499,8 +520,12 @@ int main(int argc, char* argv[]) reinterpret_cast(host_exp_n_sum.mData.data()), host_indices.mData.data()); - host_broadcast2D, Tensor, Tensor, Div, 1>( - host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{}); + host_broadcast2D, + Tensor, + Tensor, + EltwiseComputeDataType, + Div, + 1>(host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{}); bool result = true; if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData)) diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp index 182929f63cd..5d4054dad2b 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -13,6 +13,7 @@ namespace device { template using GridwiseEltwise = GridwiseElementwise_1D( p_c_global, c_grid_desc_m0.GetElementSpaceSize()); - StaticBuffer a_thread_buf; - StaticBuffer b_thread_buf; - StaticBuffer c_thread_buf; + StaticBuffer a_thread_buf; + StaticBuffer b_thread_buf; + StaticBuffer c_thread_buf; const auto thread_to_global_offset = CalculateElementwiseIndex(); auto a_global_load = ThreadwiseTensorSliceTransfer_v2, // SliceLengths @@ -90,7 +91,7 @@ struct GridwiseElementwise_1D auto b_global_load = ThreadwiseTensorSliceTransfer_v2, // SliceLengths @@ -101,7 +102,7 @@ struct GridwiseElementwise_1D false>{b_grid_desc_m0, thread_to_global_offset}; auto c_global_write = - ThreadwiseTensorSliceTransfer_v1r3 Date: Wed, 20 Apr 2022 12:12:51 +0000 Subject: [PATCH 18/28] Add padding --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 1 - .../gpu/device/device_elementwise_2d.hpp | 19 +++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 0accded132e..aafa54cca82 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -468,7 +468,6 @@ int main(int argc, char* argv[]) auto broadcastDiv_invoker_ptr = broadcastDiv.MakeInvokerPointer(); broadcastDiv_invoker_ptr->Run(broadcastDiv_argument_ptr.get(), nrepeat); - // TODO = do_verification if(do_verification) { std::cout << "verification..." << std::endl; diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp index 5d4054dad2b..ccccb5d6a85 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -34,10 +34,21 @@ struct DeviceElementwise_2D : public DeviceElementwise make_naive_tensor_descriptor(make_tuple(m, n), make_tuple(stride[0], stride[1])); // 1d desc - [m * n] - return transform_tensor_descriptor(desc_m_n, - make_tuple(make_merge_transform(make_tuple(m, n))), - make_tuple(Sequence<0, 1>{}), - make_tuple(Sequence<0>{})); + const auto desc_m0 = + transform_tensor_descriptor(desc_m_n, + make_tuple(make_merge_transform(make_tuple(m, n))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + // pad + const auto m0 = desc_m0.GetLength(I0); + const auto pad = math::integer_least_multiple(m0, ScalarPerVector) - m0; + const auto desc_m0_pad = + transform_tensor_descriptor(desc_m0, + make_tuple(make_right_pad_transform(m0, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return desc_m0_pad; } using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1})); From 0e6bf342df9b985094b36f8b5169d3ac750c4b4a Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 20 Apr 2022 12:24:36 +0000 Subject: [PATCH 19/28] Rename elementwise p[ to binary elementwise --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 10 +++--- ...wise.hpp => device_binary_elementwise.hpp} | 2 +- ...d.hpp => device_binary_elementwise_2d.hpp} | 33 ++++++++++--------- ...hpp => gridwise_binary_elementwise_1d.hpp} | 18 +++++----- 4 files changed, 33 insertions(+), 30 deletions(-) rename include/ck/tensor_operation/gpu/device/{device_elementwise.hpp => device_binary_elementwise.hpp} (93%) rename include/ck/tensor_operation/gpu/device/{device_elementwise_2d.hpp => device_binary_elementwise_2d.hpp} (86%) rename include/ck/tensor_operation/gpu/grid/{gridwise_elementwise_1d.hpp => gridwise_binary_elementwise_1d.hpp} (95%) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index aafa54cca82..08aefd87c52 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -23,7 +23,7 @@ #include "device_reduce_blockwise.hpp" #include "reduction_enums.hpp" #include "reduction_operator_mapping.hpp" -#include "device_elementwise_2d.hpp" +#include "device_binary_elementwise_2d.hpp" template using S = ck::Sequence; @@ -170,7 +170,7 @@ struct Div }; using DeviceElementwiseSubExpInstance = - ck::tensor_operation::device::DeviceElementwise_2D; using DeviceElementwiseDivInstance = ck::tensor_operation::device:: - DeviceElementwise_2D; + DeviceBinaryElementwise_2D; using HostGemmInstance = ck::tensor_operation::host:: ReferenceGemm; @@ -412,7 +412,7 @@ int main(int argc, char* argv[]) if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get())) { throw std::runtime_error("The runtime parameters seems not supported by the " - "DeviceElementwise_2D instance, exiting!"); + "DeviceBinaryElementwise_2D instance, exiting!"); }; auto broadcastSubExp_invoker_ptr = broadcastSubExp.MakeInvokerPointer(); @@ -462,7 +462,7 @@ int main(int argc, char* argv[]) if(!broadcastDiv.IsSupportedArgument(broadcastDiv_argument_ptr.get())) { throw std::runtime_error("The runtime parameters seems not supported by the " - "DeviceElementwise_2D instance, exiting!"); + "DeviceBinaryElementwise_2D instance, exiting!"); }; auto broadcastDiv_invoker_ptr = broadcastDiv.MakeInvokerPointer(); diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp similarity index 93% rename from include/ck/tensor_operation/gpu/device/device_elementwise.hpp rename to include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp index e79e1112256..8809a1090ca 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -9,7 +9,7 @@ namespace tensor_operation { namespace device { template -struct DeviceElementwise : public BaseOperator +struct DeviceBinaryElementwise : public BaseOperator { virtual std::unique_ptr diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp similarity index 86% rename from include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp rename to include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp index ccccb5d6a85..fd67ed62746 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp @@ -3,8 +3,8 @@ #include #include "device.hpp" -#include "device_elementwise.hpp" -#include "gridwise_elementwise_1d.hpp" +#include "device_binary_elementwise.hpp" +#include "gridwise_binary_elementwise_1d.hpp" namespace ck { namespace tensor_operation { @@ -18,7 +18,7 @@ template -struct DeviceElementwise_2D : public DeviceElementwise +struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise { static_assert(ThreadTileSize % ScalarPerVector == 0); static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize; @@ -51,16 +51,16 @@ struct DeviceElementwise_2D : public DeviceElementwise return desc_m0_pad; } - using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1})); - using GridwiseEltwise = GridwiseElementwise_1D; + using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1})); + using GridwiseBinEltwise = GridwiseBinaryElementwise_1D; struct Argument : public BaseArgument { @@ -101,7 +101,7 @@ struct DeviceElementwise_2D : public DeviceElementwise float Run(const Argument& arg, int nrepeat = 1) { - const auto kernel = kernel_elementwise_1d auto str = std::stringstream(); // clang-format off - str << "DeviceElementwise_2D" + str << "DeviceBinaryElementwise_2D" << "<" + << "ThreadPerBlock = " << ThreadPerBlock + << "ThreadTileSize = " << ThreadTileSize + << "ScalarPerVector = " << ScalarPerVector << ">"; // clang-format on diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp similarity index 95% rename from include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp rename to include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp index 789345c1312..0e2be0e4519 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp @@ -7,7 +7,7 @@ namespace ck { -template -struct GridwiseElementwise_1D +struct GridwiseBinaryElementwise_1D { static constexpr auto I0 = Number<0>{}; static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize; From d7112d37f2d83e3d8b7f384dac7a60abe1adfc49 Mon Sep 17 00:00:00 2001 From: rocking Date: Wed, 20 Apr 2022 17:21:02 +0000 Subject: [PATCH 20/28] Fix the padding --- .../gpu/device/device_binary_elementwise_2d.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp index fd67ed62746..5417314683f 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp @@ -42,7 +42,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise Date: Thu, 21 Apr 2022 15:16:17 +0000 Subject: [PATCH 21/28] Rewrite the elementwise operation. Let memory coalesce between block --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 25 ++++++---- .../gpu/device/device_binary_elementwise.hpp | 17 ++++--- .../device/device_binary_elementwise_2d.hpp | 47 ++++++++----------- .../grid/gridwise_binary_elementwise_1d.hpp | 26 +++++----- include/ck/utility/get_id.hpp | 4 ++ 5 files changed, 60 insertions(+), 59 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 08aefd87c52..9d31f62ba6b 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -171,16 +171,21 @@ struct Div using DeviceElementwiseSubExpInstance = ck::tensor_operation::device::DeviceBinaryElementwise_2D; - -using DeviceElementwiseDivInstance = ck::tensor_operation::device:: - DeviceBinaryElementwise_2D; + CDataType, + CDataType, + EltwiseComputeDataType, + Sub_Exp, + 256, + 8>; + +using DeviceElementwiseDivInstance = + ck::tensor_operation::device::DeviceBinaryElementwise_2D; using HostGemmInstance = ck::tensor_operation::host:: ReferenceGemm; diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp index 8809a1090ca..56ff5141fcb 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -12,15 +12,14 @@ template struct DeviceBinaryElementwise : public BaseOperator { - virtual std::unique_ptr - MakeArgumentPointer(const void* p_a, - const void* p_b, - void* p_c, - const std::vector& shape_a, - const std::vector& stride_a, - const std::vector& shape_b, - const std::vector& stride_b, - ElementwiseFunctor functor) = 0; + virtual std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + const std::vector& shape_a, + const std::vector& stride_a, + const std::vector& shape_b, + const std::vector& stride_b, + ElementwiseFunctor functor) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp index 5417314683f..3fa679cc2fe 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp @@ -16,15 +16,14 @@ template struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise { - static_assert(ThreadTileSize % ScalarPerVector == 0); - static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize; - static constexpr auto I0 = Number<0>{}; + static constexpr auto I0 = Number<0>{}; - static auto MakeDescriptor_M0(const std::vector& shape, const std::vector& stride) + static auto MakeDescriptor_M0(const std::vector& shape, + const std::vector& stride, + index_t gridSize) { const int m = shape[0]; const int n = shape[1]; @@ -41,8 +40,9 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise{})); // pad - const auto m0 = desc_m0.GetLength(I0); - const auto pad = math::integer_least_multiple(m0, BlockTileSize) - m0; + const auto m0 = desc_m0.GetLength(I0); + const index_t loop_step = gridSize * ThreadPerBlock * ScalarPerVector; + const auto pad = math::integer_least_multiple(m0, loop_step) - m0; const auto desc_m0_pad = transform_tensor_descriptor(desc_m0, make_tuple(make_right_pad_transform(m0, pad)), @@ -51,15 +51,13 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise; struct Argument : public BaseArgument @@ -75,11 +73,12 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise; - float avgTime = 0; - const index_t gridSize = CalculateGridSize(arg.c_grid_desc_m0_); + float avgTime = 0; if(nrepeat == 0) { launch_kernel(kernel, - dim3(gridSize), + dim3(arg.gridSize_), dim3(ThreadPerBlock), 0, arg.p_a_, @@ -127,7 +121,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwisec_grid_desc_m0_.GetLength(I0); - if(m0 % BlockTileSize != 0) + if(m0 % ScalarPerVector != 0) return false; return true; @@ -195,7 +189,6 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise"; // clang-format on diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp index 0e2be0e4519..18eb4017e29 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp @@ -36,13 +36,10 @@ template struct GridwiseBinaryElementwise_1D { - static constexpr auto I0 = Number<0>{}; - static constexpr int BlockTileSize = ThreadPerBlock * ThreadTileSize; + static constexpr auto I0 = Number<0>{}; static constexpr auto thread_desc_M0 = make_naive_tensor_descriptor_packed(make_tuple(Number{})); @@ -50,10 +47,8 @@ struct GridwiseBinaryElementwise_1D static __device__ __host__ auto CalculateElementwiseIndex() { - const index_t thread_id = get_thread_local_1d_id(); - const index_t block_id = get_block_1d_id(); - - return make_multi_index(block_id * BlockTileSize + thread_id * ScalarPerVector); + const index_t global_thread_id = get_thread_global_1d_id(); + return make_multi_index(global_thread_id * ScalarPerVector); } __device__ static void Run(const ADataType* __restrict__ p_a_global, @@ -116,8 +111,13 @@ struct GridwiseBinaryElementwise_1D false>{ c_grid_desc_m0, thread_to_global_offset, PassThrough{}}; - int num_iter = ThreadTileSize / ScalarPerVector; - constexpr auto thread_to_global_step = make_multi_index(ThreadPerBlock * ScalarPerVector); + const index_t threadPerBlock = get_block_size(); + const index_t blockPerGrid = get_grid_size(); + const auto m0 = c_grid_desc_m0.GetLength(I0); + const index_t loop_step = blockPerGrid * threadPerBlock * ScalarPerVector; + const auto loop_step_index = make_multi_index(loop_step); + + index_t num_iter = m0 / (loop_step); do { // read and process ScalarPerVector elements @@ -140,9 +140,9 @@ struct GridwiseBinaryElementwise_1D c_grid_desc_m0, c_global_buf); - a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, thread_to_global_step); - b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, thread_to_global_step); - c_global_write.MoveDstSliceWindow(c_grid_desc_m0, thread_to_global_step); + a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index); + b_global_load.MoveSrcSliceWindow(b_grid_desc_m0, loop_step_index); + c_global_write.MoveDstSliceWindow(c_grid_desc_m0, loop_step_index); } while(--num_iter); } }; diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp index f742512d400..d2a689c1cca 100644 --- a/include/ck/utility/get_id.hpp +++ b/include/ck/utility/get_id.hpp @@ -7,10 +7,14 @@ __device__ constexpr index_t get_wave_size() { return CK_GPU_WAVE_SIZE; } __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } +__device__ index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; } + __device__ index_t get_wave_local_1d_id() { return threadIdx.x / get_wave_size(); } __device__ index_t get_block_1d_id() { return blockIdx.x; } __device__ index_t get_grid_size() { return gridDim.x; } +__device__ index_t get_block_size() { return blockDim.x; } + } // namespace ck From 680cfaa715dbae4d079cc6bd5b676b909f371376 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 22 Apr 2022 04:20:08 +0000 Subject: [PATCH 22/28] Fix the meaning of broadcast dim parameter --- example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 9d31f62ba6b..a0522f7fc79 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -223,7 +223,7 @@ void host_broadcast2D( { ComputeDataType Amn = static_cast(A(m, n)); ComputeDataType Cmn = 0; - if constexpr(broadcastDim == 1) + if constexpr(broadcastDim == 0) { ComputeDataType Bn = static_cast(B(n)); functor(Cmn, Amn, Bn); @@ -516,7 +516,7 @@ int main(int argc, char* argv[]) Tensor, EltwiseComputeDataType, Sub_Exp, - 1>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{}); + 0>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{}); host_reduce_sum.Run(1, // alpha reinterpret_cast(exp_m_n.mData.data()), @@ -529,7 +529,7 @@ int main(int argc, char* argv[]) Tensor, EltwiseComputeDataType, Div, - 1>(host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{}); + 0>(host_softmax_m_n, exp_m_n, exp_n_sum, M, N, Div{}); bool result = true; if(result &= ck::utils::check_err(c_m_n.mData, host_c_m_n.mData)) From a41f5481d4adf974864dc48ace28bd395ba6223b Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 25 Apr 2022 09:12:23 +0000 Subject: [PATCH 23/28] 1. Fix coding style 2. Use DeviceGemm_Xdl_CShuffle instead of deprecated DeviceGemmXdl_C_Shuffle --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 26 +++++++++++-------- .../grid/gridwise_binary_elementwise_1d.hpp | 16 ++++++------ 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index a0522f7fc79..0b4d0c41b34 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -15,7 +15,7 @@ #include "device_tensor.hpp" #include "device_gemm_xdl.hpp" -#include "device_gemm_xdl_c_shuffle.hpp" +#include "device_gemm_xdl_cshuffle.hpp" #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" @@ -50,19 +50,23 @@ using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + // clang-format off -using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle< +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle< + ALayout, // ALayout + BLayout, // BLayout + CLayout, // CLayout ADataType, // ADataType BDataType, // BDataType CDataType, // CDataType AccDataType, // AccDataType CDataType, // CShuffleDataType - ALayout, // ALayout - BLayout, // BLayout - CLayout, // CLayout PassThrough, // AElementwiseOperation PassThrough, // BElementwiseOperation PassThrough, // CElementwiseOperation + GemmDefault, // GemmSpec + 1, // NumGemmKPrefetchStage 256, // BlockSize 256, // MPerBlock 128, // NPerBlock @@ -89,7 +93,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle true, // BBlockLdsAddExtraN 1, // CShuffleMXdlPerWavePerShuffle 1, // CShuffleNXdlPerWavePerShuffle - S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on @@ -149,7 +153,7 @@ using DeviceReduceSumInstance = 1, 1>; -struct Sub_Exp +struct SubExp { __host__ __device__ constexpr void operator()(EltwiseComputeDataType& dst, const EltwiseComputeDataType& src1, @@ -174,7 +178,7 @@ using DeviceElementwiseSubExpInstance = CDataType, CDataType, EltwiseComputeDataType, - Sub_Exp, + SubExp, 256, 8>; @@ -412,7 +416,7 @@ int main(int argc, char* argv[]) {StrideC, 1}, {0, 1}, {StrideC, 1}, - Sub_Exp{}); + SubExp{}); if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get())) { @@ -515,8 +519,8 @@ int main(int argc, char* argv[]) Tensor, Tensor, EltwiseComputeDataType, - Sub_Exp, - 0>(host_exp_m_n, c_m_n, c_n_max, M, N, Sub_Exp{}); + SubExp, + 0>(host_exp_m_n, c_m_n, c_n_max, M, N, SubExp{}); host_reduce_sum.Run(1, // alpha reinterpret_cast(exp_m_n.mData.data()), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp index 18eb4017e29..aea54ff53c4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp @@ -40,7 +40,7 @@ template {}; - static constexpr auto thread_desc_M0 = + static constexpr auto thread_desc_m0 = make_naive_tensor_descriptor_packed(make_tuple(Number{})); using PassThrough = tensor_operation::element_wise::PassThrough; @@ -76,7 +76,7 @@ struct GridwiseBinaryElementwise_1D ThreadwiseTensorSliceTransfer_v2, // SliceLengths Sequence<0>, // DimAccessOrder 0, // SrcVectorDim @@ -88,7 +88,7 @@ struct GridwiseBinaryElementwise_1D ThreadwiseTensorSliceTransfer_v2, // SliceLengths Sequence<0>, // DimAccessOrder 0, // SrcVectorDim @@ -99,7 +99,7 @@ struct GridwiseBinaryElementwise_1D auto c_global_write = ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths @@ -122,19 +122,19 @@ struct GridwiseBinaryElementwise_1D { // read and process ScalarPerVector elements a_global_load.Run( - a_grid_desc_m0, a_global_buf, thread_desc_M0, make_tuple(I0), a_thread_buf); + a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf); b_global_load.Run( - b_grid_desc_m0, b_global_buf, thread_desc_M0, make_tuple(I0), b_thread_buf); + b_grid_desc_m0, b_global_buf, thread_desc_m0, make_tuple(I0), b_thread_buf); static_for<0, ScalarPerVector, 1>{}([&](auto m) { - constexpr auto offset = thread_desc_M0.CalculateOffset(make_tuple(m)); + constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m)); functor(c_thread_buf(Number{}), a_thread_buf(Number{}), b_thread_buf(Number{})); }); - c_global_write.Run(thread_desc_M0, + c_global_write.Run(thread_desc_m0, make_tuple(I0), // SrcSliceOriginIdx c_thread_buf, c_grid_desc_m0, From f919809dfa5c174117dfca7bfa1bb283cd37925c Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 26 Apr 2022 01:54:08 +0000 Subject: [PATCH 24/28] Move threadPerBlock to argument --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 8 ++--- .../gpu/device/device_binary_elementwise.hpp | 3 +- .../device/device_binary_elementwise_2d.hpp | 32 +++++++++++-------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 0b4d0c41b34..b477ff68d46 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -179,7 +179,6 @@ using DeviceElementwiseSubExpInstance = CDataType, EltwiseComputeDataType, SubExp, - 256, 8>; using DeviceElementwiseDivInstance = @@ -188,7 +187,6 @@ using DeviceElementwiseDivInstance = CDataType, EltwiseComputeDataType, Div, - 256, 8>; using HostGemmInstance = ck::tensor_operation::host:: @@ -416,7 +414,8 @@ int main(int argc, char* argv[]) {StrideC, 1}, {0, 1}, {StrideC, 1}, - SubExp{}); + SubExp{}, + 256); if(!broadcastSubExp.IsSupportedArgument(broadcastSubExp_argument_ptr.get())) { @@ -466,7 +465,8 @@ int main(int argc, char* argv[]) {StrideC, 1}, {0, 1}, {StrideC, 1}, - Div{}); + Div{}, + 256); if(!broadcastDiv.IsSupportedArgument(broadcastDiv_argument_ptr.get())) { diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp index 56ff5141fcb..f97e7d835ff 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -19,7 +19,8 @@ struct DeviceBinaryElementwise : public BaseOperator const std::vector& stride_a, const std::vector& shape_b, const std::vector& stride_b, - ElementwiseFunctor functor) = 0; + ElementwiseFunctor functor, + index_t threadPerBlock) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp index 3fa679cc2fe..370efd483e5 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp @@ -15,7 +15,6 @@ template struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise { @@ -23,7 +22,8 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise& shape, const std::vector& stride, - index_t gridSize) + index_t gridSize, + index_t threadPerBlock) { const int m = shape[0]; const int n = shape[1]; @@ -41,7 +41,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise& stride_a, const std::vector& stride_b, const std::vector& stride_c, - ElementwiseFunctor functor) + ElementwiseFunctor functor, + index_t threadPerBlock) : p_a_(p_a), p_b_(p_b), p_c_(p_c), functor_(functor), + threadPerBlock_(threadPerBlock), gridSize_(128) // FIXME - Calculate the grid size by number of CU in the future { - a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_); - b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_); - c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_); + a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, threadPerBlock_); + b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, threadPerBlock_); + c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, threadPerBlock_); } const ADataType* p_a_; @@ -88,6 +90,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise; - float avgTime = 0; + float avgTime = 0; if(nrepeat == 0) { launch_kernel(kernel, dim3(arg.gridSize_), - dim3(ThreadPerBlock), + dim3(arg.threadPerBlock_), 0, arg.p_a_, arg.p_b_, @@ -122,7 +125,7 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise& stride_a, const std::vector& stride_b, const std::vector& stride_c, - ElementwiseFunctor functor) override + ElementwiseFunctor functor, + index_t threadPerBlock) override { return std::make_unique(static_cast(p_a), static_cast(p_b), @@ -173,7 +177,8 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise MakeInvokerPointer() override @@ -188,7 +193,6 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise"; // clang-format on From 976815e539cf6e67bd742e2521aca7ddd60f6024 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 28 Apr 2022 16:18:18 +0800 Subject: [PATCH 25/28] Prevent compile error when user pass rvalue, eg {3, 4} --- .../gpu/device/device_binary_elementwise.hpp | 8 ++++---- .../gpu/device/device_binary_elementwise_2d.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp index f97e7d835ff..eba2d7979f2 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp @@ -15,10 +15,10 @@ struct DeviceBinaryElementwise : public BaseOperator virtual std::unique_ptr MakeArgumentPointer(const void* p_a, const void* p_b, void* p_c, - const std::vector& shape_a, - const std::vector& stride_a, - const std::vector& shape_b, - const std::vector& stride_b, + std::vector shape_a, + std::vector stride_a, + std::vector shape_b, + std::vector stride_b, ElementwiseFunctor functor, index_t threadPerBlock) = 0; diff --git a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp index 370efd483e5..9d37a1d6439 100644 --- a/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_binary_elementwise_2d.hpp @@ -163,10 +163,10 @@ struct DeviceBinaryElementwise_2D : public DeviceBinaryElementwise MakeArgumentPointer(const void* p_a, const void* p_b, void* p_c, - const std::vector& shape, - const std::vector& stride_a, - const std::vector& stride_b, - const std::vector& stride_c, + std::vector shape, + std::vector stride_a, + std::vector stride_b, + std::vector stride_c, ElementwiseFunctor functor, index_t threadPerBlock) override { From ea09fd3260ca7e342c3ed2757c4cd45c0f119e52 Mon Sep 17 00:00:00 2001 From: rocking Date: Thu, 28 Apr 2022 16:42:53 +0800 Subject: [PATCH 26/28] Fix typo --- example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index b477ff68d46..156e3f467d5 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -42,7 +42,7 @@ using CDataType = F16; using AccDataType = F32; using EltwiseComputeDataType = F32; -// CAUSION - host reduce_max will call numeric_limits::lowest() +// CAUTION - host reduce_max will call numeric_limits::lowest() // However, numeric_limits::lowest() will return zero. So, used half_float::half instead using HostReduceDataType = half_float::half; From bfc80764c257c3f4a403da7780c4c1dd9125c347 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 29 Apr 2022 16:14:17 +0800 Subject: [PATCH 27/28] [What] Fix data type for host reduction [Why] F16 issue for host reduction has been fix in c1ef73192e9303f48bac53327150dac4983af51d --- .../19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index 156e3f467d5..bce9fefae1a 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -42,10 +42,6 @@ using CDataType = F16; using AccDataType = F32; using EltwiseComputeDataType = F32; -// CAUTION - host reduce_max will call numeric_limits::lowest() -// However, numeric_limits::lowest() will return zero. So, used half_float::half instead -using HostReduceDataType = half_float::half; - using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; @@ -192,18 +188,18 @@ using DeviceElementwiseDivInstance = using HostGemmInstance = ck::tensor_operation::host:: ReferenceGemm; -using HostReduceMaxInstance = ReductionHost; -using HostReduceSumInstance = ReductionHost(c_m_n.mData.data()), + c_m_n.mData.data(), 0, // beta - reinterpret_cast(host_c_n_max.mData.data()), + host_c_n_max.mData.data(), host_indices.mData.data()); host_broadcast2D, @@ -523,9 +519,9 @@ int main(int argc, char* argv[]) 0>(host_exp_m_n, c_m_n, c_n_max, M, N, SubExp{}); host_reduce_sum.Run(1, // alpha - reinterpret_cast(exp_m_n.mData.data()), + exp_m_n.mData.data(), 0, // beta - reinterpret_cast(host_exp_n_sum.mData.data()), + host_exp_n_sum.mData.data(), host_indices.mData.data()); host_broadcast2D, From b6fe118ed0355b91956e6ad03ede09b55dc93264 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 9 May 2022 13:37:42 +0800 Subject: [PATCH 28/28] Fix typo --- example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp index bce9fefae1a..13518c1a6e4 100644 --- a/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp +++ b/example/19_gemm_softmax/gemm_softmax_xdl_fp16.cpp @@ -373,8 +373,8 @@ int main(int argc, char* argv[]) // do reduce max auto reduce_max = DeviceReduceMaxInstance{}; - auto reduce_max_workaspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims); - DeviceMem reduce_max_workaspace_device_buf(reduce_max_workaspace_size); + auto reduce_max_workspace_size = reduce_max.GetWorkspaceSizeInBytes(c_m_n_shape, reduceDims); + DeviceMem reduce_max_workaspace_device_buf(reduce_max_workspace_size); auto reduce_max_argument_ptr = reduce_max.MakeArgumentPointer( c_m_n_shape,