From a18e648116a4edf0b36da7bf6cecb8a649874994 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 15 Sep 2021 08:49:42 +0000 Subject: [PATCH 01/13] Tiny fix in using data type template parameters in blockwise and direct_threadwise kernel --- .../gridwise_generic_2d_reduction_blockwise.hpp | 2 +- .../gridwise_generic_2d_reduction_direct_threadwise.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp index 20075526b2a..0a950d29b86 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp @@ -281,7 +281,7 @@ struct GridwiseReduction_xy_to_x_blockwise ThreadClusterLengths, Sequence<0, 1>, srcDataType, - dstDataType, + compType, src2dDescType, decltype(in_block_desc), Sequence<0, 1>, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp index a38c2dc3356..99aaeef6c04 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp @@ -232,7 +232,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2 Date: Wed, 15 Sep 2021 08:54:54 +0000 Subject: [PATCH 02/13] Fix with regard to implementing GetZeroVal() in both kernel and host --- .../include/utility/reduction_operator.hpp | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp index 269671a4000..05484ccf745 100644 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ b/composable_kernel/include/utility/reduction_operator.hpp @@ -82,7 +82,7 @@ struct Max { using dataType = T; - __device__ static T GetZeroVal() { return std::numeric_limits::min(); }; + __device__ static T GetZeroVal() { return std::numeric_limits::lowest(); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -127,16 +127,45 @@ struct Min static constexpr bool indexable = true; }; +template +struct AMax +{ + using dataType = T; + + __device__ static T GetZeroVal() { return type_convert{}(0.0f); }; + + __device__ inline constexpr void operator()(T& a, T b) const + { + if(a < b) + a = b; + } + + __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + { + if(a < b) + { + a = b; + changed = true; + } + } + + static constexpr bool indexable = true; +}; + template <> __device__ half_t Max::GetZeroVal() { - return type_convert{}(std::numeric_limits::min()); + const unsigned short binary_lowest = 0xFBFF; + + return *reinterpret_cast(&binary_lowest); }; template <> __device__ half_t Min::GetZeroVal() { - return type_convert{}(std::numeric_limits::max()); + const unsigned short binary_max = 0x7BFF; + + return *reinterpret_cast(&binary_max); }; // Unary operators are usually called element-wisely before the reduction is executed on the @@ -281,8 +310,6 @@ struct reduce_binary_operator using opType = reduce::Add; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Add::GetZeroVal(); }; - static constexpr bool indexable = reduce::Add::indexable; }; @@ -292,8 +319,6 @@ struct reduce_binary_operator using opType = reduce::Mul; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Mul::GetZeroVal(); }; - static constexpr bool indexable = reduce::Mul::indexable; }; @@ -303,8 +328,6 @@ struct reduce_binary_operator using opType = reduce::Min; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Min::GetZeroVal(); }; - static constexpr bool indexable = reduce::Min::indexable; }; @@ -314,19 +337,15 @@ struct reduce_binary_operator using opType = reduce::Max; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Max::GetZeroVal(); }; - static constexpr bool indexable = reduce::Max::indexable; }; template struct reduce_binary_operator { - using opType = reduce::Max; + using opType = reduce::AMax; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Max::GetZeroVal(); }; - static constexpr bool indexable = reduce::Max::indexable; }; @@ -336,8 +355,6 @@ struct reduce_binary_operator using opType = reduce::Add; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Add::GetZeroVal(); }; - static constexpr bool indexable = reduce::Add::indexable; }; @@ -347,8 +364,6 @@ struct reduce_binary_operator using opType = reduce::Add; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Add::GetZeroVal(); }; - static constexpr bool indexable = reduce::Add::indexable; }; @@ -358,8 +373,6 @@ struct reduce_binary_operator using opType = reduce::Add; using dataType = T; - __device__ static T GetZeroVal() { return reduce::Add::GetZeroVal(); }; - static constexpr bool indexable = reduce::Add::indexable; }; From eac1753d2dccd74f31e59156978521f31d83d84c Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 15 Sep 2021 10:58:37 +0000 Subject: [PATCH 03/13] Avoid convert to compType from dstDataType before writting the output value --- ...ridwise_generic_2d_reduction_blockwise.hpp | 30 +++++++++++++------ ...generic_2d_reduction_direct_threadwise.hpp | 30 +++++++++++++------ ...e_generic_2d_reduction_direct_warpwise.hpp | 30 +++++++++++++------ 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp index 0a950d29b86..85d1abec7b8 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp @@ -180,6 +180,10 @@ struct GridwiseReduction_xy_to_x_blockwise if(!float_equal_one{}(alpha)) accuValue_buf(I0) *= type_convert{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = @@ -200,11 +204,11 @@ struct GridwiseReduction_xy_to_x_blockwise threadwise_dst_load.Run( dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = @@ -368,11 +376,11 @@ struct GridwiseReduction_xy_to_x_blockwise make_tuple(I0), priorDstValue_buf); - accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = @@ -570,11 +582,11 @@ struct GridwiseReduction_xy_to_x_blockwise make_tuple(I0), priorDstValue_buf); - accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3 @@ -271,6 +275,10 @@ struct GridwiseReduction_xy_to_x_direct_threadwise if(!float_equal_one{}(alpha)) accuValue_buf(I0) *= type_convert{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = @@ -176,11 +180,11 @@ struct GridwiseReduction_xy_to_x_direct_warpwise threadwise_dst_load.Run( dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - accuValue_buf(I0) += type_convert{}(priorDstValue_buf(I0) * beta); + dstValue_buf(I0) += priorDstValue_buf(I0) * beta; } auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = @@ -314,11 +322,11 @@ struct GridwiseReduction_xy_to_x_direct_warpwise make_tuple(I0), priorDstValue_buf); - accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3{}(alpha); + StaticBuffer dstValue_buf; + + dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); + if(!float_equal_zero{}(beta)) { auto threadwise_dst_load = @@ -489,11 +501,11 @@ struct GridwiseReduction_xy_to_x_direct_warpwise make_tuple(I0), priorDstValue_buf); - accuValue_buf(I0) += type_convert{}(priorDstValue_buf[I0] * beta); + dstValue_buf(I0) += priorDstValue_buf[I0] * beta; } auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3 Date: Fri, 17 Sep 2021 10:03:10 +0000 Subject: [PATCH 04/13] Add half_t support to NumericLimits and make constexpr GetZeroVal() of binary operator --- ...ridwise_generic_2d_reduction_blockwise.hpp | 6 ++--- ...generic_2d_reduction_direct_threadwise.hpp | 6 ++--- ...e_generic_2d_reduction_direct_warpwise.hpp | 6 ++--- ...idwise_generic_2d_reduction_multiblock.hpp | 4 +-- .../include/utility/data_type.hpp | 27 ++++++++++++------- .../include/utility/reduction_operator.hpp | 26 ++++-------------- 6 files changed, 33 insertions(+), 42 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp index 85d1abec7b8..de91659cf34 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp @@ -92,7 +92,7 @@ struct GridwiseReduction_xy_to_x_blockwise // LDS __shared__ compType p_in_block_buffer[BlockBufferSize]; - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -243,7 +243,7 @@ struct GridwiseReduction_xy_to_x_blockwise __shared__ compType p_in_block_buffer[BlockBufferSize]; __shared__ int block_indices_buffer[BlockBufferSize]; - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -431,7 +431,7 @@ struct GridwiseReduction_xy_to_x_blockwise __shared__ compType p_in_block_buffer[BlockBufferSize]; __shared__ int block_indices_buffer[BlockBufferSize]; - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp index beeb27e471d..48ba8b67707 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise (void)ws_indices_global; (void)indices_global; - const auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -204,7 +204,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise { (void)ws_indices_global; - const auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -348,7 +348,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise { (void)origReduceLen; - const auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp index b4ebd2f57b6..e6831f2b6df 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise (void)ws_indices_global; (void)indices_global; - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -215,7 +215,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise { (void)ws_indices_global; - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -373,7 +373,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise { (void)origReduceLen; - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp index c5638931011..cc65b745302 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp @@ -86,7 +86,7 @@ struct GridwiseReduction_xy_to_x_multiblock (void)alpha; // unused (void)beta; // unused - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); // LDS __shared__ compType p_in_block_buffer[BlockBufferSize]; @@ -216,7 +216,7 @@ struct GridwiseReduction_xy_to_x_multiblock (void)alpha; // unused (void)beta; // unused - auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetZeroVal(); // LDS __shared__ compType p_in_block_values_buffer[BlockBufferSize]; diff --git a/composable_kernel/include/utility/data_type.hpp b/composable_kernel/include/utility/data_type.hpp index bfaac8a939d..07eceb84cff 100644 --- a/composable_kernel/include/utility/data_type.hpp +++ b/composable_kernel/include/utility/data_type.hpp @@ -1008,20 +1008,27 @@ struct inner_product_with_conversion }; template -struct NumericLimits; +struct NumericLimits +{ + __host__ __device__ static constexpr T Min() { return std::numeric_limits::min(); } + + __host__ __device__ static constexpr T Max() { return std::numeric_limits::max(); } + + __host__ __device__ static constexpr T Lowest() { return std::numeric_limits::lowest(); } +}; template <> -struct NumericLimits +struct NumericLimits { - __host__ __device__ static constexpr int32_t Min() - { - return std::numeric_limits::min(); - } + static constexpr unsigned short binary_min = 0x0400; + static constexpr unsigned short binary_max = 0x7BFF; + static constexpr unsigned short binary_lowest = 0xFBFF; - __host__ __device__ static constexpr int32_t Max() - { - return std::numeric_limits::max(); - } + __host__ __device__ static constexpr half_t Min() { return as_type(binary_min); } + + __host__ __device__ static constexpr half_t Max() { return as_type(binary_max); } + + __host__ __device__ static constexpr half_t Lowest() { return as_type(binary_lowest); } }; } // namespace ck diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp index 05484ccf745..edc2f41735b 100644 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ b/composable_kernel/include/utility/reduction_operator.hpp @@ -58,7 +58,7 @@ struct Add { using dataType = T; - __device__ static T GetZeroVal() { return type_convert{}(0.0f); }; + __device__ static constexpr T GetZeroVal() { return static_cast(0.0f); }; __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } @@ -70,7 +70,7 @@ struct Mul { using dataType = T; - __device__ static T GetZeroVal() { return type_convert{}(1.0f); }; + __device__ static constexpr T GetZeroVal() { return static_cast(1.0f); }; __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } @@ -82,7 +82,7 @@ struct Max { using dataType = T; - __device__ static T GetZeroVal() { return std::numeric_limits::lowest(); }; + __device__ static constexpr T GetZeroVal() { return NumericLimits::lowest(); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -107,7 +107,7 @@ struct Min { using dataType = T; - __device__ static T GetZeroVal() { return std::numeric_limits::max(); }; + __device__ static constexpr T GetZeroVal() { return NumericLimits::Max(); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -132,7 +132,7 @@ struct AMax { using dataType = T; - __device__ static T GetZeroVal() { return type_convert{}(0.0f); }; + __device__ static constexpr T GetZeroVal() { return static_cast(0.0f); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -152,22 +152,6 @@ struct AMax static constexpr bool indexable = true; }; -template <> -__device__ half_t Max::GetZeroVal() -{ - const unsigned short binary_lowest = 0xFBFF; - - return *reinterpret_cast(&binary_lowest); -}; - -template <> -__device__ half_t Min::GetZeroVal() -{ - const unsigned short binary_max = 0x7BFF; - - return *reinterpret_cast(&binary_max); -}; - // Unary operators are usually called element-wisely before the reduction is executed on the // elements. // They are needed for easy implementation of reduction types of AVG, NRM1, NRM2 From 92e1588dc7d0be121ffba12556ee7b6ff0d49460 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Sun, 5 Sep 2021 14:26:01 +0000 Subject: [PATCH 05/13] Add CONSTANT decorator for descriptor read buffer --- ...ric_reduction_first_call_blockwise_reduce_all_dims.cpp | 6 +++--- ...reduction_first_call_blockwise_reduce_partial_dims.cpp | 6 +++--- ...ic_reduction_first_call_multiblock_reduce_all_dims.cpp | 8 ++++---- ...eduction_first_call_multiblock_reduce_partial_dims.cpp | 8 ++++---- ...ic_reduction_first_call_threadwise_reduce_all_dims.cpp | 6 +++--- ...eduction_first_call_threadwise_reduce_partial_dims.cpp | 6 +++--- ...eric_reduction_first_call_warpwise_reduce_all_dims.cpp | 6 +++--- ..._reduction_first_call_warpwise_reduce_partial_dims.cpp | 6 +++--- .../gridwise_generic_reduction_second_call_blockwise.cpp | 8 ++++---- .../gridwise_generic_reduction_second_call_threadwise.cpp | 8 ++++---- .../gridwise_generic_reduction_second_call_warpwise.cpp | 8 ++++---- 11 files changed, 38 insertions(+), 38 deletions(-) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp index e16010dee1a..752cec7d20c 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp @@ -277,15 +277,15 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)BlkGroupSize; (void)ws_buf2_bytes_offset; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp index cba7ffe2958..bc5eb88fff0 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp @@ -278,15 +278,15 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)BlkGroupSize; (void)ws_buf2_bytes_offset; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp index 34b877027c2..0fafff722b5 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp @@ -279,16 +279,16 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)p_dst_global; (void)indices_global; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; - void* ws_buf1_global = static_cast(ws_global) + 4096; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp index 9c7318dc156..24403ba4b23 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp @@ -279,16 +279,16 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)p_dst_global; (void)indices_global; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; - void* ws_buf1_global = static_cast(ws_global) + 4096; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp index 8e67d1faa1a..884ab6a7783 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp @@ -290,15 +290,15 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)BlkGroupSize; (void)ws_buf2_bytes_offset; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp index fdbcda64ba5..3e15b00f10c 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp @@ -291,15 +291,15 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)BlkGroupSize; (void)ws_buf2_bytes_offset; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp index 8aa1376c3a1..c98d9d6bdbf 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp @@ -291,15 +291,15 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)BlkGroupSize; (void)ws_buf2_bytes_offset; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp index e18d623fe50..434f651cf9b 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp @@ -292,15 +292,15 @@ extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)BlkGroupSize; (void)ws_buf2_bytes_offset; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp index b7b58cbb908..dc11200b836 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp @@ -237,15 +237,15 @@ extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)p_src_global; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; - void* ws_buf1_global = static_cast(ws_global) + 4096; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp index ef88547028c..402c4c7f1fa 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp @@ -251,15 +251,15 @@ extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)p_src_global; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; - void* ws_buf1_global = static_cast(ws_global) + 4096; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp index 53b0e1e7599..81897cfc136 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp @@ -252,15 +252,15 @@ extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, const void* __restrict__ p_src_global, float beta, void* __restrict__ p_dst_global, - void* __restrict__ ws_global, + const void CONSTANT* ws_global, long ws_buf2_bytes_offset, void* __restrict__ indices_global) { (void)p_src_global; - const void* p_src2dDesc = ws_global; - const void* p_dst1dDesc = static_cast(ws_global) + 2048; - void* ws_buf1_global = static_cast(ws_global) + 4096; + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); From 52ae56f8598a6b34b9682e4b202f2fea7346b9b9 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Thu, 2 Sep 2021 09:15:13 +0000 Subject: [PATCH 06/13] Use get_thread_local_1d_id() for thread local Id --- ...ric_reduction_first_call_blockwise_reduce_all_dims.cpp | 6 +++--- ...reduction_first_call_blockwise_reduce_partial_dims.cpp | 6 +++--- ...ic_reduction_first_call_multiblock_reduce_all_dims.cpp | 6 +++--- ...eduction_first_call_multiblock_reduce_partial_dims.cpp | 6 +++--- ...ic_reduction_first_call_threadwise_reduce_all_dims.cpp | 8 ++++---- ...eduction_first_call_threadwise_reduce_partial_dims.cpp | 8 ++++---- ...eric_reduction_first_call_warpwise_reduce_all_dims.cpp | 8 ++++---- ..._reduction_first_call_warpwise_reduce_partial_dims.cpp | 8 ++++---- .../gridwise_generic_reduction_second_call_blockwise.cpp | 6 +++--- .../gridwise_generic_reduction_second_call_threadwise.cpp | 8 ++++---- .../gridwise_generic_reduction_second_call_warpwise.cpp | 8 ++++---- 11 files changed, 39 insertions(+), 39 deletions(-) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp index 752cec7d20c..049d720b86a 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp @@ -179,16 +179,16 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp index bc5eb88fff0..8df91e8c51a 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp @@ -179,16 +179,16 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp index 0fafff722b5..8df63f55196 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp @@ -181,16 +181,16 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp index 24403ba4b23..d7cf4633b93 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp @@ -180,16 +180,16 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp index 884ab6a7783..52c9a7c8fd3 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp @@ -178,12 +178,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad2)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } @@ -195,12 +195,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; } }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp index 3e15b00f10c..b9daaee1fdb 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp @@ -178,12 +178,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad2)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } @@ -195,12 +195,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; } }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp index c98d9d6bdbf..bf34ed44c84 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp @@ -179,12 +179,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad2)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } @@ -196,12 +196,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; } }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp index 434f651cf9b..43d45ad700b 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp @@ -179,12 +179,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad2)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } @@ -196,12 +196,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; } }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp index dc11200b836..f7b2f5e32c5 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp @@ -152,16 +152,16 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp index 402c4c7f1fa..826b9257abd 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp @@ -152,12 +152,12 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad2)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } @@ -169,12 +169,12 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; } }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp index 81897cfc136..483c74bdd2d 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp @@ -153,12 +153,12 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, make_pad_transform(toReduceLen, 0, srcPad2)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_src2dDesc) = src2dDesc; } @@ -170,12 +170,12 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc_2; } else { - if(hipThreadIdx_x == 0) + if(get_thread_local_1d_id() == 0) *static_cast(p_dst1dDesc) = dst1dDesc; } }; From 4fea4251eab462197bf865e3859044b807143d12 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Fri, 17 Sep 2021 11:54:20 +0000 Subject: [PATCH 07/13] Rename GetZeroVal() to GetReductionZeroVal() in the kernels --- .../gridwise_generic_2d_reduction_blockwise.hpp | 6 +++--- ...se_generic_2d_reduction_direct_threadwise.hpp | 6 +++--- ...wise_generic_2d_reduction_direct_warpwise.hpp | 6 +++--- .../gridwise_generic_2d_reduction_multiblock.hpp | 4 ++-- .../reduction_functions_blockwise.hpp | 4 ++-- .../reduction_functions_warpwise.hpp | 12 ++++++------ .../include/utility/reduction_operator.hpp | 16 ++++++++-------- 7 files changed, 27 insertions(+), 27 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp index de91659cf34..0ad45f50fe0 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp @@ -92,7 +92,7 @@ struct GridwiseReduction_xy_to_x_blockwise // LDS __shared__ compType p_in_block_buffer[BlockBufferSize]; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -243,7 +243,7 @@ struct GridwiseReduction_xy_to_x_blockwise __shared__ compType p_in_block_buffer[BlockBufferSize]; __shared__ int block_indices_buffer[BlockBufferSize]; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -431,7 +431,7 @@ struct GridwiseReduction_xy_to_x_blockwise __shared__ compType p_in_block_buffer[BlockBufferSize]; __shared__ int block_indices_buffer[BlockBufferSize]; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp index 48ba8b67707..5cf7352a9b9 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise (void)ws_indices_global; (void)indices_global; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -204,7 +204,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise { (void)ws_indices_global; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -348,7 +348,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise { (void)origReduceLen; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp index e6831f2b6df..5fe7bfc8c95 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise (void)ws_indices_global; (void)indices_global; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -215,7 +215,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise { (void)ws_indices_global; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -373,7 +373,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise { (void)origReduceLen; - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp index cc65b745302..06e1930ab20 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp @@ -86,7 +86,7 @@ struct GridwiseReduction_xy_to_x_multiblock (void)alpha; // unused (void)beta; // unused - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); // LDS __shared__ compType p_in_block_buffer[BlockBufferSize]; @@ -216,7 +216,7 @@ struct GridwiseReduction_xy_to_x_multiblock (void)alpha; // unused (void)beta; // unused - constexpr auto zeroVal = opReduce::GetZeroVal(); + constexpr auto zeroVal = opReduce::GetReductionZeroVal(); // LDS __shared__ compType p_in_block_values_buffer[BlockBufferSize]; diff --git a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp index 3df257a6d9b..046d3311aa7 100644 --- a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp @@ -56,7 +56,7 @@ struct BlockwiseReduction_2d_block_buffer Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData) { const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); index_t offset; for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) @@ -115,7 +115,7 @@ struct BlockwiseReduction_2d_block_buffer int& accuIndex) { const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); int lAccuIndex = 0; if constexpr(blockIsOneRow) diff --git a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp index a8d5750b25d..9687d2d8c86 100644 --- a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp +++ b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp @@ -62,7 +62,7 @@ struct WarpReduce // This interface implementation uses HIP built-in device shuffling functions __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData) { - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); static_for<0, ThreadBufferLen, 1>{}( [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); @@ -84,7 +84,7 @@ struct WarpReduce // since for fp16, built-in shuffling functions is not provided by HIP __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData) { - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); static_for<0, ThreadBufferLen, 1>{}( [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); @@ -138,7 +138,7 @@ struct WarpReduce int& accuIndex, int indexStart) { - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); int lAccuIndex = 0; index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize; @@ -170,7 +170,7 @@ struct WarpReduce int& accuIndex, int indexStart) { - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); int lAccuIndex = 0; index_t thread_id = get_thread_local_1d_id(); index_t warpId = thread_id / warpSize; @@ -278,7 +278,7 @@ struct WarpReduceWithIndicesInput compType& accuData, int& accuIndex) { - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); int lAccuIndex = 0; static_for<0, ThreadBufferLen, 1>{}([&](auto I) { @@ -307,7 +307,7 @@ struct WarpReduceWithIndicesInput compType& accuData, int& accuIndex) { - compType lAccuData = opReduce::GetZeroVal(); + compType lAccuData = opReduce::GetReductionZeroVal(); int lAccuIndex = 0; index_t thread_id = get_thread_local_1d_id(); index_t warpId = thread_id / warpSize; diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp index edc2f41735b..75829e3c507 100644 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ b/composable_kernel/include/utility/reduction_operator.hpp @@ -35,8 +35,8 @@ namespace reduce { // Every binary operator used in reduction is represented by a templated functor class. Each functor // class must provide at least // three members: -// 1) GetZeroVal() -- the interface to return the "identity element" for the binary operator, -// "identity element" is the unique +// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary +// operator, "identity element" is the unique // element in the algebraic space that doesn't affect the value of other elements // when operated with any of them. // 2) indexable -- boolean value indicating whether indices of the operated elements could be @@ -58,7 +58,7 @@ struct Add { using dataType = T; - __device__ static constexpr T GetZeroVal() { return static_cast(0.0f); }; + __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } @@ -70,7 +70,7 @@ struct Mul { using dataType = T; - __device__ static constexpr T GetZeroVal() { return static_cast(1.0f); }; + __device__ static constexpr T GetReductionZeroVal() { return static_cast(1.0f); }; __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } @@ -82,7 +82,7 @@ struct Max { using dataType = T; - __device__ static constexpr T GetZeroVal() { return NumericLimits::lowest(); }; + __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::lowest(); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -107,7 +107,7 @@ struct Min { using dataType = T; - __device__ static constexpr T GetZeroVal() { return NumericLimits::Max(); }; + __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::Max(); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -132,7 +132,7 @@ struct AMax { using dataType = T; - __device__ static constexpr T GetZeroVal() { return static_cast(0.0f); }; + __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; __device__ inline constexpr void operator()(T& a, T b) const { @@ -281,7 +281,7 @@ struct unary_sqrt // The templated struct reduce_binary_operator maps the enum Ids of binary operators to their // respective functor classes. -// The "GetZeroVal()" interface and boolean member "indexable" are also provided in +// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in // reduce_binary_operactor for // easier checking by the upper-layer codes in the kernels. From 7a7497f92408b1e7dfb130a69d2324c19e3e9199 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Fri, 17 Sep 2021 13:23:32 +0000 Subject: [PATCH 08/13] Remove constexpr from initialized zeroVal and tiny fix in reduction_operator.hpp --- .../gridwise_generic_2d_reduction_blockwise.hpp | 6 +++--- .../gridwise_generic_2d_reduction_direct_threadwise.hpp | 6 +++--- .../gridwise_generic_2d_reduction_direct_warpwise.hpp | 6 +++--- .../gridwise_generic_2d_reduction_multiblock.hpp | 4 ++-- composable_kernel/include/utility/reduction_operator.hpp | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp index 0ad45f50fe0..c635da57f4d 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp @@ -92,7 +92,7 @@ struct GridwiseReduction_xy_to_x_blockwise // LDS __shared__ compType p_in_block_buffer[BlockBufferSize]; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -243,7 +243,7 @@ struct GridwiseReduction_xy_to_x_blockwise __shared__ compType p_in_block_buffer[BlockBufferSize]; __shared__ int block_indices_buffer[BlockBufferSize]; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -431,7 +431,7 @@ struct GridwiseReduction_xy_to_x_blockwise __shared__ compType p_in_block_buffer[BlockBufferSize]; __shared__ int block_indices_buffer[BlockBufferSize]; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp index 5cf7352a9b9..adfeacc0374 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise (void)ws_indices_global; (void)indices_global; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -204,7 +204,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise { (void)ws_indices_global; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -348,7 +348,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise { (void)origReduceLen; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp index 5fe7bfc8c95..4136dae75ff 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise (void)ws_indices_global; (void)indices_global; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -215,7 +215,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise { (void)ws_indices_global; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_buf = make_dynamic_buffer( p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); @@ -373,7 +373,7 @@ struct GridwiseReduction_xy_to_x_direct_warpwise { (void)origReduceLen; - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); const auto src_global_val_buf = make_dynamic_buffer(ws_values_global, diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp index 06e1930ab20..feee2b594a3 100644 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp @@ -86,7 +86,7 @@ struct GridwiseReduction_xy_to_x_multiblock (void)alpha; // unused (void)beta; // unused - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); // LDS __shared__ compType p_in_block_buffer[BlockBufferSize]; @@ -216,7 +216,7 @@ struct GridwiseReduction_xy_to_x_multiblock (void)alpha; // unused (void)beta; // unused - constexpr auto zeroVal = opReduce::GetReductionZeroVal(); + const auto zeroVal = opReduce::GetReductionZeroVal(); // LDS __shared__ compType p_in_block_values_buffer[BlockBufferSize]; diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp index 75829e3c507..59c23e1517d 100644 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ b/composable_kernel/include/utility/reduction_operator.hpp @@ -82,7 +82,7 @@ struct Max { using dataType = T; - __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::lowest(); }; + __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::Lowest(); }; __device__ inline constexpr void operator()(T& a, T b) const { From 7218a2b7fe73089256f9353f50f0b9a78cdae2c1 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Sat, 18 Sep 2021 08:08:04 +0000 Subject: [PATCH 09/13] Occasional tiny simplification and update in the kernel files --- .../include/utility/reduction_common.hpp | 23 ++----------------- ...n_first_call_blockwise_reduce_all_dims.cpp | 2 +- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/composable_kernel/include/utility/reduction_common.hpp b/composable_kernel/include/utility/reduction_common.hpp index 139a18c2a45..1b5486a0051 100644 --- a/composable_kernel/include/utility/reduction_common.hpp +++ b/composable_kernel/include/utility/reduction_common.hpp @@ -28,13 +28,6 @@ // this enumerate should be synchronized with include/miopen/reduce_common.hpp namespace ck { -enum class ReductionMethod_t -{ - DirectThreadWise = 1, - DirectWarpWise = 2, - BlockWise = 3, - MultiBlock = 4 -}; // end of namespace ck enum class ReduceTensorOp_t { @@ -71,31 +64,19 @@ enum class IndicesType_t struct float_equal_one { - template - __device__ static inline bool apply(T x) - { - return x <= type_convert{}(1.0f) and x >= type_convert{}(1.0f); - } - template __device__ inline bool operator()(T x) { - return (float_equal_one::apply(x)); + return x <= static_cast(1.0f) and x >= static_cast(1.0f); }; }; struct float_equal_zero { - template - __device__ static inline bool apply(T x) - { - return x <= type_convert{}(0.0f) and x >= type_convert{}(0.0f); - } - template __device__ inline bool operator()(T x) { - return (float_equal_zero::apply(x)); + return x <= static_cast(0.0f) and x >= static_cast(0.0f); }; }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp index 049d720b86a..271dfd34baa 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp @@ -253,7 +253,7 @@ using refType_src2dDesc_padded_34 = using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; -template +template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) { if constexpr(need_padding) From 2bc1ce054ffe76579e1e122850b46e3e78236419 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Sat, 18 Sep 2021 13:35:42 +0000 Subject: [PATCH 10/13] Update to re-order tensor dimensions on the host, split second_call kernel wrapper files and simplify reduce_all kernel wrappers --- ...n_first_call_blockwise_reduce_all_dims.cpp | 70 ++---- ...rst_call_blockwise_reduce_partial_dims.cpp | 27 +- ..._first_call_multiblock_reduce_all_dims.cpp | 71 ++---- ...st_call_multiblock_reduce_partial_dims.cpp | 27 +- ..._first_call_threadwise_reduce_all_dims.cpp | 72 ++---- ...st_call_threadwise_reduce_partial_dims.cpp | 27 +- ...on_first_call_warpwise_reduce_all_dims.cpp | 45 +--- ...irst_call_warpwise_reduce_partial_dims.cpp | 27 +- ..._second_call_blockwise_reduce_all_dims.cpp | 213 ++++++++++++++++ ...nd_call_blockwise_reduce_partial_dims.cpp} | 19 +- ...second_call_threadwise_reduce_all_dims.cpp | 230 ++++++++++++++++++ ...d_call_threadwise_reduce_partial_dims.cpp} | 7 +- ...n_second_call_warpwise_reduce_all_dims.cpp | 229 +++++++++++++++++ ...ond_call_warpwise_reduce_partial_dims.cpp} | 17 +- 14 files changed, 774 insertions(+), 307 deletions(-) create mode 100644 composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp rename composable_kernel/src/kernel_wrapper/{gridwise_generic_reduction_second_call_blockwise.cpp => gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp} (94%) create mode 100644 composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp rename composable_kernel/src/kernel_wrapper/{gridwise_generic_reduction_second_call_threadwise.cpp => gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp} (97%) create mode 100644 composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp rename composable_kernel/src/kernel_wrapper/{gridwise_generic_reduction_second_call_warpwise.cpp => gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp} (95%) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp index 271dfd34baa..089e775152e 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp @@ -45,7 +45,7 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; +using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,13 +58,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge, toReduceDims>::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced static_assert(dstDims == 1, "If all source dimensions are reduced, the dest should have only one dimension !!"); @@ -110,18 +103,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, void* __restrict__ ws_global) { (void)GridSize; @@ -132,18 +113,14 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); const auto one_dim_srcDesc = transform_tensor_descriptor( srcDesc, @@ -157,14 +134,8 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + constexpr int invariantLen = 1; + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; @@ -189,20 +160,18 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, } if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; + *static_cast(p_dst1dDesc) = dstDesc; }; -template +template struct get_ref_desc_types { static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; // don't have to use accurate strides to get an expected referrence type static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( ref_srcDesc, @@ -217,12 +186,6 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); @@ -235,23 +198,20 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}, Sequence<1>{}))); using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, + decltype(transform_tensor_descriptor(ref_dstDesc, make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{}))); using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types::refType_dst1dDesc_padded; + typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp index 8df91e8c51a..a3daeaf1639 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,15 +62,7 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); +static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -111,12 +106,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, int outStride0, int outStride1, int outStride2, @@ -133,14 +122,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; const int dstStrides[6] = { outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp index 8df63f55196..a425af0dfff 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp @@ -45,8 +45,7 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; // this could be empty +using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,13 +58,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge, toReduceDims>::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced static_assert(dstDims == 1, "If all source dimensions are reduced, the dest should have only one dimension !!"); @@ -111,18 +103,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, void* __restrict__ ws_global) { (void)GridSize; @@ -132,18 +112,14 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); const auto one_dim_srcDesc = transform_tensor_descriptor( srcDesc, @@ -157,14 +133,8 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + constexpr int invariantLen = 1; + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; const index_t reduceSizePerBlock = @@ -191,20 +161,18 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, } if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; + *static_cast(p_dst1dDesc) = dstDesc; }; -template +template struct get_ref_desc_types { static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; // don't have to use accurate strides to get an expected referrence type static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( ref_srcDesc, @@ -219,12 +187,6 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); @@ -237,23 +199,20 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}, Sequence<1>{}))); using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, + decltype(transform_tensor_descriptor(ref_dstDesc, make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{}))); using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types::refType_dst1dDesc_padded; + typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp index d7cf4633b93..0e578f4d1d8 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,15 +62,7 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); +static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -111,12 +106,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, int outStride0, int outStride1, int outStride2, @@ -132,14 +121,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; const int dstStrides[6] = { outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp index 52c9a7c8fd3..6dd806567cb 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp @@ -45,7 +45,7 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; +using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,13 +58,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge, toReduceDims>::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced static_assert(dstDims == 1, "If all source dimensions are reduced, the dest should have only one dimension !!"); @@ -110,18 +103,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, void* __restrict__ ws_global) { (void)BlkGroupSize; @@ -131,18 +112,14 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); const auto one_dim_srcDesc = transform_tensor_descriptor( srcDesc, @@ -156,14 +133,8 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + constexpr int invariantLen = 1; + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); constexpr auto copySliceLen = GredThreadBufferLength; @@ -191,7 +162,7 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, { const auto dstPad = GridSize * BlockSize - invariantLen; auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, + transform_tensor_descriptor(dstdDesc, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); @@ -201,21 +172,19 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, else { if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; + *static_cast(p_dst1dDesc) = dstDesc; } }; -template +template struct get_ref_desc_types { static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; // don't have to use accurate strides to get an expected referrence type static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( ref_srcDesc, @@ -230,12 +199,6 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); @@ -248,23 +211,20 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}, Sequence<1>{}))); using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, + decltype(transform_tensor_descriptor(ref_dstDesc, make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{}))); using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types::refType_dst1dDesc_padded; + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp index b9daaee1fdb..698f740058f 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,15 +62,7 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); +static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -111,12 +106,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, int outStride0, int outStride1, int outStride2, @@ -132,14 +121,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; const int dstStrides[6] = { outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp index bf34ed44c84..e4387151d50 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp @@ -45,7 +45,7 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; +using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,13 +58,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge, toReduceDims>::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced static_assert(dstDims == 1, "If all source dimensions are reduced, the dest should have only one dimension !!"); @@ -110,18 +103,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, void* __restrict__ ws_global) { (void)BlkGroupSize; @@ -131,18 +112,14 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); const auto one_dim_srcDesc = transform_tensor_descriptor( srcDesc, @@ -156,14 +133,8 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); + constexpr int invariantLen = 1; + const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; @@ -192,7 +163,7 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, { const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, + transform_tensor_descriptor(dstDesc, make_tuple(make_pad_transform(invariantLen, 0, dstPad)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{})); @@ -202,7 +173,7 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, else { if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; + *static_cast(p_dst1dDesc) = dstDesc; } }; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp index 43d45ad700b..a6415279006 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,15 +62,7 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); +static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -111,12 +106,6 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, int inStride3, int inStride4, int inStride5, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, int outStride0, int outStride1, int outStride2, @@ -132,14 +121,12 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; const int dstStrides[6] = { outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); + const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp new file mode 100644 index 00000000000..fd3350d43d5 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp @@ -0,0 +1,213 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_blockwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +static_assert(dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable + +extern "C" __global__ void +gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) +{ + (void)GridSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); + + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const index_t invariantLen = dstDesc.GetLength(Number<0>{}); + const index_t toReduceLen = BlkGroupSize; + + auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); + + constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; + + if constexpr(src2d_need_padding) + { + const auto srcPad = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pass_through_transform(invariantLen), + make_pad_transform(toReduceLen, 0, srcPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(get_thread_local_1d_id() == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(get_thread_local_1d_id() == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if(get_thread_local_1d_id() == 0) + *static_cast(p_dst1dDesc) = dstDesc; +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_tupleDstLengths = make_tuple(8); + static constexpr auto ref_dstDesc = + make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); + + static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); + static constexpr index_t ref_toReduceLen = 8; + + static constexpr auto ref_src2dDesc = + make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); + + // used by the BlockWise and MultiBlock method + using refType_src2dDesc_padded_34 = decltype( + transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pass_through_transform(ref_invariantLen), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dstDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); +}; + +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = + typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + const void CONSTANT* ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_src_global; + + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 3 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(ws_buf1_global), + beta, + static_cast(p_dst_global), + static_cast(ws_buf2_global), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp similarity index 94% rename from composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp rename to composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp index f7b2f5e32c5..7fc2ce48ce9 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; // this could be empty +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,15 +62,11 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp new file mode 100644 index 00000000000..7e8c43797e8 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp @@ -0,0 +1,230 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +using toReduceDims = Sequence; +using invariantDims = Sequence; // this could be empty + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +static_assert(dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable + +extern "C" __global__ void +gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); + + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const index_t invariantLen = dstDesc.GetLength(Number<0>{}); + const index_t toReduceLen = BlkGroupSize; + + auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); + + constexpr auto copySliceLen = GredThreadBufferLength; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(get_thread_local_1d_id() == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(get_thread_local_1d_id() == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dstDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(get_thread_local_1d_id() == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(get_thread_local_1d_id() == 0) + *static_cast(p_dst1dDesc) = dstDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_tupleDstLengths = make_tuple(8); + static constexpr auto ref_dstDesc = + make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); + + static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); + static constexpr index_t ref_toReduceLen = 8; + + static constexpr auto ref_src2dDesc = + make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dstDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); +}; + +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + const void CONSTANT* ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_src_global; + + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 3 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(ws_buf1_global), + beta, + static_cast(p_dst_global), + static_cast(ws_buf2_global), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp similarity index 97% rename from composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp rename to composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp index 826b9257abd..8d8755b6150 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; // this could be empty +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp new file mode 100644 index 00000000000..328e65012b0 --- /dev/null +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp @@ -0,0 +1,229 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "config.hpp" +#include "number.hpp" +#include "sequence.hpp" +#include "tensor_descriptor_helper.hpp" +#include "data_type_enum_helper.hpp" +#include "reduction_common.hpp" +#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" + +using namespace ck; + +using srcDataType = + typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; +using dstDataType = + typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; +using compType = + typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; + +constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable + +constexpr index_t srcDims = CK_PARAM_IN_DIMS; +constexpr index_t dstDims = CK_PARAM_OUT_DIMS; + +constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); +constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 + ? NanPropagation_t::NOT_PROPAGATE_NAN + : NanPropagation_t::PROPAGATE_NAN; +constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 + ? ReduceTensorIndices_t::NO_INDICES + : ReduceTensorIndices_t::FLATTENED_INDICES; + +constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); +constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); + +static_assert(dstDims == 1, + "If all source dimensions are reduced, the dest should have only one dimension !!"); + +constexpr bool indexable = reduce_binary_operator::indexable; +constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); + +constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable + +extern "C" __global__ void +gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) +{ + (void)BlkGroupSize; + + void* p_src2dDesc = ws_global; + void* p_dst1dDesc = static_cast(ws_global) + 2048; + + const auto tupleDstLengths = make_tuple(1); + const auto tupleDstStrides = make_tuple(1); + + auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + const index_t invariantLen = dstDesc.GetLength(Number<0>{}); + const index_t toReduceLen = BlkGroupSize; + + auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); + + constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; + + if constexpr(src2d_need_padding) + { + const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; + const auto srcPad2 = + ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; + + auto src2dDesc_2 = + transform_tensor_descriptor(src2dDesc, + make_tuple(make_pad_transform(invariantLen, 0, srcPad1), + make_pad_transform(toReduceLen, 0, srcPad2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + if(get_thread_local_1d_id() == 0) + *static_cast(p_src2dDesc) = src2dDesc_2; + } + else + { + if(get_thread_local_1d_id() == 0) + *static_cast(p_src2dDesc) = src2dDesc; + } + + if constexpr(dst1d_need_padding) + { + const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; + auto dst1dDesc_2 = + transform_tensor_descriptor(dstDesc, + make_tuple(make_pad_transform(invariantLen, 0, dstPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + if(get_thread_local_1d_id() == 0) + *static_cast(p_dst1dDesc) = dst1dDesc_2; + } + else + { + if(get_thread_local_1d_id() == 0) + *static_cast(p_dst1dDesc) = dstDesc; + } +}; + +template +struct get_ref_desc_types +{ + static constexpr auto ref_tupleDstLengths = make_tuple(8); + static constexpr auto ref_dstDesc = + make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); + + static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); + static constexpr index_t ref_toReduceLen = 8; + + static constexpr auto ref_src2dDesc = + make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); + + using refType_src2dDesc = decltype(ref_src2dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); + + // used by the DirectThreadWise and DirectWarpWise method + using refType_src2dDesc_padded_12 = + decltype(transform_tensor_descriptor(ref_src2dDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2), + make_pad_transform(ref_toReduceLen, 0, 2)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}))); + + using refType_dst1dDesc_padded = + decltype(transform_tensor_descriptor(ref_dstDesc, + make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{}))); +}; + +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; + +template +static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_src2dDesc)); + else + return (*reinterpret_cast(p_src2dDesc)); +}; + +template +static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) +{ + if constexpr(need_padding) + return (*reinterpret_cast(p_dst1dDesc)); + else + return (*reinterpret_cast(p_dst1dDesc)); +}; + +extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, + float alpha, + const void* __restrict__ p_src_global, + float beta, + void* __restrict__ p_dst_global, + const void CONSTANT* ws_global, + long ws_buf2_bytes_offset, + void* __restrict__ indices_global) +{ + (void)p_src_global; + + const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); + const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; + void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); + + const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); + const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); + + using gridwise_2d_reduce = + GridwiseReduction_xy_to_x_direct_warpwise; + + void* const ws_buf2_global = + ws_buf2_bytes_offset > 0 + ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) + : nullptr; + + constexpr int RunId = need_indices ? 3 : 1; + gridwise_2d_reduce::template Run( + src2dDesc, + dst1dDesc, + origReduceLen, + alpha, + static_cast(ws_buf1_global), + beta, + static_cast(p_dst_global), + static_cast(ws_buf2_global), + static_cast(indices_global)); +}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp similarity index 95% rename from composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp rename to composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp index 483c74bdd2d..612f33f006b 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp @@ -45,8 +45,11 @@ constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -using toReduceDims = Sequence; -using invariantDims = Sequence; // this could be empty +constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; +constexpr index_t num_invariantDims = srcDims - num_toReduceDims; + +using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; +using toReduceDims = typename arithmetic_sequence_gen::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -59,15 +62,7 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); +static_assert(num_toReduceDims > 0, "At least one dimension need be reduced!!!"); constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); From e030a286567d9e0150d8632fa7fa3e42cbe0c689 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Sat, 18 Sep 2021 14:15:44 +0000 Subject: [PATCH 11/13] Update to remove OpenCL tidy checking failures --- .../include/utility/reduction_common.hpp | 36 +--------- .../include/utility/reduction_enums.hpp | 66 +++++++++++++++++++ 2 files changed, 68 insertions(+), 34 deletions(-) create mode 100644 composable_kernel/include/utility/reduction_enums.hpp diff --git a/composable_kernel/include/utility/reduction_common.hpp b/composable_kernel/include/utility/reduction_common.hpp index 1b5486a0051..ff574c315c1 100644 --- a/composable_kernel/include/utility/reduction_common.hpp +++ b/composable_kernel/include/utility/reduction_common.hpp @@ -26,41 +26,9 @@ #ifndef CK_REDUCTION_COMMON_HPP #define CK_REDUCTION_COMMON_HPP -// this enumerate should be synchronized with include/miopen/reduce_common.hpp -namespace ck { - -enum class ReduceTensorOp_t -{ - ADD = 0, - MUL = 1, - MIN = 2, - MAX = 3, - AMAX = 4, - AVG = 5, - NORM1 = 6, - NORM2 = 7, - // MUL_NO_ZEROS = 8, -}; - -enum class NanPropagation_t -{ - NOT_PROPAGATE_NAN = 0, - PROPAGATE_NAN = 1, -}; +#include "reduction_enums.hpp" -enum class ReduceTensorIndices_t -{ - NO_INDICES = 0, - FLATTENED_INDICES = 1, -}; - -enum class IndicesType_t -{ - INDICES_32BIT = 0, - INDICES_64BIT = 1, - INDICES_16BIT = 2, - INDICES_8BIT = 3, -}; +namespace ck { struct float_equal_one { diff --git a/composable_kernel/include/utility/reduction_enums.hpp b/composable_kernel/include/utility/reduction_enums.hpp new file mode 100644 index 00000000000..e97108179ea --- /dev/null +++ b/composable_kernel/include/utility/reduction_enums.hpp @@ -0,0 +1,66 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef CK_REDUCTION_ENUMS_HPP +#define CK_REDUCTION_ENUMS_HPP + +namespace ck { + +enum class ReduceTensorOp_t +{ + ADD = 0, + MUL = 1, + MIN = 2, + MAX = 3, + AMAX = 4, + AVG = 5, + NORM1 = 6, + NORM2 = 7, + // MUL_NO_ZEROS = 8, +}; + +enum class NanPropagation_t +{ + NOT_PROPAGATE_NAN = 0, + PROPAGATE_NAN = 1, +}; + +enum class ReduceTensorIndices_t +{ + NO_INDICES = 0, + FLATTENED_INDICES = 1, +}; + +enum class IndicesType_t +{ + INDICES_32BIT = 0, + INDICES_64BIT = 1, + INDICES_16BIT = 2, + INDICES_8BIT = 3, +}; + +}; // end of namespace ck + +#endif From 1b9c39be7fe77cffbc91b57e96a9b2e599007171 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Fri, 24 Sep 2021 07:37:19 +0000 Subject: [PATCH 12/13] Update for better readability --- composable_kernel/include/utility/reduction_operator.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp index 59c23e1517d..c0afbec8695 100644 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ b/composable_kernel/include/utility/reduction_operator.hpp @@ -38,7 +38,9 @@ namespace reduce { // 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary // operator, "identity element" is the unique // element in the algebraic space that doesn't affect the value of other elements -// when operated with any of them. +// when operated against them, and the concept is similar to zero vector in +// vector space +// (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf). // 2) indexable -- boolean value indicating whether indices of the operated elements could be // recorded. Usually, Min/Max operator could // need to record the indices of elements. For operator like Add/Mul, no need to From c36084f591b2f6bdf6e5b0157d898d979faca35d Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Mon, 27 Sep 2021 09:40:11 +0000 Subject: [PATCH 13/13] Remove unused codes and not-needed template parameters in the kernel wrappers --- ...n_first_call_blockwise_reduce_all_dims.cpp | 6 ---- ..._first_call_multiblock_reduce_all_dims.cpp | 6 ---- ..._first_call_threadwise_reduce_all_dims.cpp | 6 ---- ...on_first_call_warpwise_reduce_all_dims.cpp | 34 +++++-------------- ..._second_call_blockwise_reduce_all_dims.cpp | 16 +++------ ...ond_call_blockwise_reduce_partial_dims.cpp | 28 +++------------ ...second_call_threadwise_reduce_all_dims.cpp | 16 +++------ ...nd_call_threadwise_reduce_partial_dims.cpp | 32 +++-------------- ...n_second_call_warpwise_reduce_all_dims.cpp | 16 +++------ ...cond_call_warpwise_reduce_partial_dims.cpp | 24 +++---------- 10 files changed, 35 insertions(+), 149 deletions(-) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp index 089e775152e..ca6b415910e 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp @@ -43,9 +43,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,9 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp index a425af0dfff..81899dfb021 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp @@ -43,9 +43,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,9 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp index 6dd806567cb..e63a1254e4d 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp @@ -43,9 +43,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,9 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp index e4387151d50..4a607372e95 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp @@ -43,9 +43,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -using toReduceDims = typename arithmetic_sequence_gen<0, srcDims, 1>::type; constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 @@ -58,9 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -177,17 +171,15 @@ extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, } }; -template +template struct get_ref_desc_types { static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; // don't have to use accurate strides to get an expected referrence type static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); + static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( ref_srcDesc, @@ -202,12 +194,6 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}), make_tuple(Sequence<0, 1>{})); - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); @@ -220,23 +206,19 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}, Sequence<1>{}))); using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, + decltype(transform_tensor_descriptor(ref_dstDesc, make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), make_tuple(Sequence<0>{}), make_tuple(Sequence<0>{}))); using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); + using refType_dst1dDesc = decltype(ref_dstDesc); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types::refType_dst1dDesc_padded; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp index fd3350d43d5..7e9d46612ef 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp @@ -42,9 +42,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN @@ -56,9 +53,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -108,7 +102,6 @@ gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restri *static_cast(p_dst1dDesc) = dstDesc; }; -template struct get_ref_desc_types { static constexpr auto ref_tupleDstLengths = make_tuple(8); @@ -139,11 +132,10 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}))); }; -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_34 = typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp index 7fc2ce48ce9..3f37d01e21e 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp @@ -42,15 +42,8 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable -constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN @@ -62,12 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -164,7 +151,7 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, *static_cast(p_dst1dDesc) = dst1dDesc; }; -template +template struct get_ref_desc_types { static constexpr auto ref_tupleDstLengths = @@ -202,16 +189,11 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}))); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; using refType_src2dDesc_padded_34 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; + typename get_ref_desc_types::refType_src2dDesc_padded_34; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp index 7e8c43797e8..77841d1312b 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp @@ -42,9 +42,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - using toReduceDims = Sequence; using invariantDims = Sequence; // this could be empty @@ -59,9 +56,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -125,7 +119,6 @@ gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restri } }; -template struct get_ref_desc_types { static constexpr auto ref_tupleDstLengths = make_tuple(8); @@ -156,11 +149,10 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}))); }; -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp index 8d8755b6150..2de461ad0fa 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp @@ -42,15 +42,8 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable -constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN @@ -62,16 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -//////////////////////////////////////////////////////////////////////////////////////// -using specDims = typename sequence_merge::type; - -static_assert(is_valid_sequence_map::value && specDims::Size() == srcDims, - "Wrong invariant and/or toReduce dimensions!"); - -// The number of invariant dimensions can be zero if all dimension are to be reduced -static_assert(invariantDims::Size() > 0 || dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -182,7 +165,7 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, } }; -template +template struct get_ref_desc_types { static constexpr auto ref_tupleDstLengths = @@ -220,16 +203,11 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}))); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; using refType_src2dDesc_padded_12 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp index 328e65012b0..1ba5e496579 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp @@ -42,9 +42,6 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN @@ -56,9 +53,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(dstDims == 1, - "If all source dimensions are reduced, the dest should have only one dimension !!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -123,7 +117,6 @@ gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restri } }; -template struct get_ref_desc_types { static constexpr auto ref_tupleDstLengths = make_tuple(8); @@ -154,11 +147,10 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}))); }; -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp index 612f33f006b..aef1545f118 100644 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp +++ b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp @@ -42,15 +42,8 @@ using compType = constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable -constexpr index_t srcDims = CK_PARAM_IN_DIMS; constexpr index_t dstDims = CK_PARAM_OUT_DIMS; -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 ? NanPropagation_t::NOT_PROPAGATE_NAN @@ -62,8 +55,6 @@ constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); -static_assert(num_toReduceDims > 0, "At least one dimension need be reduced!!!"); - constexpr bool indexable = reduce_binary_operator::indexable; constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); @@ -175,7 +166,7 @@ extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, } }; -template +template struct get_ref_desc_types { static constexpr auto ref_tupleDstLengths = @@ -213,16 +204,11 @@ struct get_ref_desc_types make_tuple(Sequence<0>{}))); }; -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; +using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; +using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; using refType_src2dDesc_padded_12 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; + typename get_ref_desc_types::refType_src2dDesc_padded_12; +using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; template static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)