From f2930add140fa8cbe22797b2cadf565189d590a1 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 6 Feb 2023 06:30:25 -0500 Subject: [PATCH 01/19] Sync the order of type string with template parameter --- .../gpu/device/impl/device_normalization_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp index 8cc223a8866..8f06d7a061e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp @@ -462,8 +462,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization"; // clang-format on From b53db56e1d2fd3c5c433e0386952b0942ff03903 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 6 Feb 2023 07:23:23 -0500 Subject: [PATCH 02/19] Add more instances --- .../device_normalization_f16_instance.cpp | 25 +++++++++++-------- .../device_normalization_f32_instance.cpp | 23 +++++++++++------ 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp index 8994d9dcb6e..2eecd1efc4b 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -22,19 +22,24 @@ template using device_normalization_f16_instances = std::tuple < // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> - DeviceNormalizationImpl, // fallback kernel - DeviceNormalizationImpl, // fallback kernel - DeviceNormalizationImpl, // fallback kernel - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl >; // clang-format on diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp index 4a7e1fd0b94..657f5ebc5d6 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -20,16 +20,23 @@ template using device_layernorm_f32_instances = std::tuple< // clang-format off // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - DeviceNormalizationImpl, // fallback kernel - DeviceNormalizationImpl, // fallback kernel - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, - DeviceNormalizationImpl + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl // clang-format on >; From 73d26a88f7514c237fbcfc3016a48d675eec6fac Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 7 Feb 2023 14:35:26 -0500 Subject: [PATCH 03/19] Check the vector size and remove redundant var --- ...ridwise_normalization_welford_variance.hpp | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index 70a8c020ddc..b6d4c09c962 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -43,6 +43,10 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + static_assert(XSrcVectorSize == YDstVectorSize); + static_assert(XSrcVectorSize == GammaSrcVectorSize); + static_assert(XSrcVectorSize == BetaSrcVectorSize); + static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); using ThreadClusterLengths_M_K = Sequence; @@ -77,10 +81,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; - static constexpr auto XThreadBufferNumber = Number{}; - static constexpr auto GammaThreadBufferNumber = Number{}; - static constexpr auto BetaThreadBufferNumber = Number{}; - static constexpr auto YThreadBufferNumber = Number{}; + static constexpr auto ThreadBufferNumber = Number{}; __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, int thread_k_cluster_id) @@ -93,7 +94,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk if(kPerBlockTail > 0) { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { int thread_max_len = (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i; int delta = thread_max_len - kPerBlockTail; @@ -132,7 +133,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk MThreadSliceSize * XSrcVectorSize, true>{}; }, - Number{}); + Number{}); auto gamma_thread_buf = generate_tuple( [&](auto) { @@ -141,7 +142,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk MThreadSliceSize * GammaSrcVectorSize, true>{}; }, - Number{}); + Number{}); auto beta_thread_buf = generate_tuple( [&](auto) { @@ -150,7 +151,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk MThreadSliceSize * BetaSrcVectorSize, true>{}; }, - Number{}); + Number{}); auto y_thread_buf = generate_tuple( [&](auto) { @@ -159,7 +160,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk MThreadSliceSize * YDstVectorSize, true>{}; }, - Number{}); + Number{}); StaticBuffer mean_thread_buf; StaticBuffer var_thread_buf; @@ -266,7 +267,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_x_load.Run(x_grid_desc_m_k, x_global_val_buf, thread_buffer_desc_m_k, @@ -286,7 +287,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }); auto thread_copy_tail_m_k = - (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k; + (num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k; threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); @@ -297,7 +298,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk { if constexpr(!SweepOnce) { - static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_x_load.Run(x_grid_desc_m_k, x_global_val_buf, thread_buffer_desc_m_k, @@ -307,7 +308,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }); } - static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_gamma_load.Run(gamma_grid_desc_m_k, gamma_global_val_buf, thread_buffer_desc_m_k, @@ -320,7 +321,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); @@ -338,7 +339,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }); }); - static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_beta_load.Run(beta_grid_desc_m_k, beta_global_val_buf, thread_buffer_desc_m_k, @@ -349,7 +350,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); @@ -362,7 +363,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }); }); - static_for<0, YThreadBufferNumber, 1>{}([&](auto i) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_y_store.Run(thread_buffer_desc_m_k, make_tuple(I0, I0), y_thread_buf(i), From e12a6be2c2e84099b9869e56afe82f6a43719f58 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 7 Feb 2023 14:45:04 -0500 Subject: [PATCH 04/19] Extract var to static, prepare to separate sweep once kernel --- .../grid/gridwise_normalization_welford_variance.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index b6d4c09c962..a3ff3fd2e8d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -60,6 +60,10 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk static constexpr auto thread_cluster_desc = make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + using ThreadBufferLengths_M_K = Sequence; + static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}))); using ThreadReduceDstDesc_M = @@ -81,7 +85,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; - static constexpr auto ThreadBufferNumber = Number{}; + static constexpr auto ThreadBufferNumber = Number{}; __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, int thread_k_cluster_id) @@ -174,10 +178,6 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk const auto thread_m_cluster_id = thread_cluster_idx[I0]; const auto thread_k_cluster_id = thread_cluster_idx[I1]; - using ThreadBufferLengths_M_K = Sequence; - constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2 Date: Wed, 8 Feb 2023 14:38:18 -0500 Subject: [PATCH 05/19] Separate sweeponce flow and optimize the flow --- ...ridwise_normalization_welford_variance.hpp | 220 ++++++++++++------ 1 file changed, 149 insertions(+), 71 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index a3ff3fd2e8d..23e10b7f401 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -265,7 +265,8 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk var_thread_buf(I) = type_convert(0.0f); }); - for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + // Separate sweep once and sweep twice pipeline + if constexpr(SweepOnce) { static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_x_load.Run(x_grid_desc_m_k, @@ -273,55 +274,43 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk thread_buffer_desc_m_k, make_tuple(I0, I0), x_thread_buf(i)); - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); - threadwise_welford.Run(x_thread_buf[i], mean_thread_buf, var_thread_buf); - }); - } - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if constexpr(I > 0) - block_sync_lds(); - int count = threadwise_welford.cur_count_; - BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); - }); - - auto thread_copy_tail_m_k = - (num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k; - - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); - - for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) - { - if constexpr(!SweepOnce) - { - static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { - threadwise_x_load.Run(x_grid_desc_m_k, - x_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - x_thread_buf(i)); - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); - }); - } - - static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { threadwise_gamma_load.Run(gamma_grid_desc_m_k, gamma_global_val_buf, thread_buffer_desc_m_k, make_tuple(I0, I0), gamma_thread_buf(i)); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, - thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf[i], mean_thread_buf, var_thread_buf); + + if constexpr(i != ThreadBufferNumber - 1) + { + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + } + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); }); static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(iK0)); + if constexpr(iK0 != ThreadBufferNumber - 1) + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); @@ -331,33 +320,10 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * divisor; - // gamma + // gamma & beta y_thread_buf(iK0)(Number{}) = y_thread_buf(iK0)(Number{}) * - gamma_thread_buf(iK0)(Number{}); - }); - }); - }); - - static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { - threadwise_beta_load.Run(beta_grid_desc_m_k, - beta_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - beta_thread_buf(i)); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - thread_copy_fwd_step_m_k); - }); - - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); - - // beta - y_thread_buf(iK0)(Number{}) = - y_thread_buf(iK0)(Number{}) + + gamma_thread_buf(iK0)(Number{}) + beta_thread_buf(iK0)(Number{}); }); }); @@ -369,16 +335,128 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk y_thread_buf(i), y_grid_desc_m_k, y_global_val_buf); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k); + + if constexpr(i != ThreadBufferNumber - 1) + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, + thread_copy_fwd_step_m_k); }); + } // end of sweep once + else + { + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf[i], mean_thread_buf, var_thread_buf); + }); + } - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, - 2 * thread_copy_bwd_step_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - 2 * thread_copy_bwd_step_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); - } + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + auto thread_copy_tail_m_k = + (num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k; + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf(i)); + + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // normalize + y_thread_buf(iK0)(Number{}) = + (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * + divisor; + + // gamma + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) * + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + + beta_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf(i), + y_grid_desc_m_k, + y_global_val_buf); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + } + } // end of sweep twice } }; From 9d2280d66cef32ea1ab7bbe9674c2e26c5f2742d Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 10 Feb 2023 04:58:08 -0500 Subject: [PATCH 06/19] 1. Rename AccDatatype in normalization to computeData 2. Rename AccElementwiseOperation to YElementwiseOperation in normalization --- client_example/05_layernorm/layernorm2d.cpp | 14 +++--- example/27_layernorm/layernorm_blockwise.cpp | 16 +++--- .../42_groupnorm/groupnorm_sigmoid_fp16.cpp | 14 +++--- .../gpu/device/device_normalization.hpp | 14 +++--- .../device/impl/device_normalization_impl.hpp | 50 +++++++++---------- ...ridwise_normalization_welford_variance.hpp | 42 ++++++++-------- .../device_normalization_f16_instance.cpp | 2 +- .../device_normalization_f32_instance.cpp | 2 +- .../profiler/profile_layernorm_impl.hpp | 6 +-- test/normalization/test_groupnorm_fp16.cpp | 14 +++--- test/normalization/test_groupnorm_fp32.cpp | 14 +++--- test/normalization/test_layernorm2d_fp16.cpp | 14 +++--- test/normalization/test_layernorm2d_fp32.cpp | 14 +++--- 13 files changed, 109 insertions(+), 107 deletions(-) diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp index adb41171e12..856a4cc2193 100644 --- a/client_example/05_layernorm/layernorm2d.cpp +++ b/client_example/05_layernorm/layernorm2d.cpp @@ -12,12 +12,12 @@ #include "ck/library/tensor_operation_instance/gpu/normalization.hpp" -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using AccDataType = float; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; constexpr int Rank = 2; constexpr int NumReduceDim = 1; @@ -54,7 +54,7 @@ int main(int argc, char* argv[]) using DeviceOp = ck::tensor_operation::device::DeviceNormalization; diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp index e62001d6692..d283c656607 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -23,11 +23,11 @@ constexpr int Rank = 5; constexpr int NumReduceDim = 3; -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using AccDataType = float; +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ConputeDataType = float; struct YElementOp { @@ -50,7 +50,7 @@ using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationImpl; ReferenceInstance ref; diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp index ec17ec3d18c..4739cca3dbb 100644 --- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp @@ -14,9 +14,9 @@ namespace device { template struct DeviceNormalization : public BaseOperator @@ -35,7 +35,7 @@ struct DeviceNormalization : public BaseOperator void* p_y, void* p_savedMean, void* p_savedInvVar, - AccElementwiseOperation acc_elementwise_op) = 0; + YElementwiseOperation y_elementwise_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; @@ -43,17 +43,17 @@ struct DeviceNormalization : public BaseOperator template using DeviceNormalizationPtr = std::unique_ptr>; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp index 8f06d7a061e..0353051021d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp @@ -21,20 +21,20 @@ template __global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, - AccDataType epsilon, + ConputeDataType epsilon, const XDataType* const __restrict__ p_x_global, const GammaDataType* const __restrict__ p_gamma_global, const BetaDataType* const __restrict__ p_beta_global, YDataType* const __restrict__ p_y_global, - const AccElementwiseOperation acc_elementwise_op) + const YElementwiseOperation y_elementwise_op) { GridwiseReduction::Run(x_grid_desc_m_k, gamma_grid_desc_m_k, @@ -46,7 +46,7 @@ __global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k, p_gamma_global, p_beta_global, p_y_global, - acc_elementwise_op); + y_elementwise_op); }; } // namespace ck @@ -58,9 +58,9 @@ namespace device { template { @@ -172,8 +172,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization betaStrides, const std::vector yStrides, const std::vector reduceDims, - AccElementwiseOperation acc_elementwise_op, + YElementwiseOperation y_elementwise_op, double epsilon, const XDataType* p_x, const GammaDataType* p_gamma, @@ -230,9 +230,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization(epsilon); + epsilon_ = static_cast(epsilon); Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); xStrides_ = shuffle_tensor_dimensions(xStrides, reduceDims); @@ -265,7 +265,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization{}) <= KThreadClusterSize * KThreadSliceSize; } - AccDataType epsilon_; + ConputeDataType epsilon_; const XDataType* p_x_; const GammaDataType* p_gamma_; @@ -278,7 +278,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization betaStrides_; std::vector yStrides_; - AccElementwiseOperation acc_elementwise_op_; + YElementwiseOperation y_elementwise_op_; int blkGroupSize_; int numBlockTileIteration_; @@ -301,16 +301,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization : kernel_normalization; float avg_time = 0; @@ -329,7 +329,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization(p_x), static_cast(p_gamma), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index 23e10b7f401..34a142bdf38 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -16,8 +16,8 @@ template {}))); using ThreadwiseWelford = - ThreadwiseWelford; + ThreadwiseWelford; - using BlockwiseWelford = BlockwiseWelford; @@ -115,12 +115,12 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk const GridDesc_M_K& beta_grid_desc_m_k, const GridDesc_M_K& y_grid_desc_m_k, index_t num_k_block_tile_iteration, - AccDataType epsilon, + ComputeDataType epsilon, const XDataType* const __restrict__ p_x_global, const GammaDataType* const __restrict__ p_gamma_global, const BetaDataType* const __restrict__ p_beta_global, YDataType* const __restrict__ p_y_global, - const AccElementwiseOperation acc_elementwise_op) + const YElementwiseOperation y_elementwise_op) { if constexpr(SweepOnce) { @@ -133,7 +133,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk auto x_thread_buf = generate_tuple( [&](auto) { return StaticBuffer{}; }, @@ -142,7 +142,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk auto gamma_thread_buf = generate_tuple( [&](auto) { return StaticBuffer{}; }, @@ -151,7 +151,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk auto beta_thread_buf = generate_tuple( [&](auto) { return StaticBuffer{}; }, @@ -160,14 +160,16 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk auto y_thread_buf = generate_tuple( [&](auto) { return StaticBuffer{}; }, Number{}); - StaticBuffer mean_thread_buf; - StaticBuffer var_thread_buf; + StaticBuffer + mean_thread_buf; + StaticBuffer + var_thread_buf; const index_t thread_local_id = get_thread_local_1d_id(); const index_t block_global_id = get_block_1d_id(); @@ -179,7 +181,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk const auto thread_k_cluster_id = thread_cluster_idx[I1]; auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2{}([&](auto I) { - mean_thread_buf(I) = type_convert(0.0f); - var_thread_buf(I) = type_convert(0.0f); + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); }); // Separate sweep once and sweep twice pipeline diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp index 2eecd1efc4b..b24ee9f9f44 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -21,7 +21,7 @@ template // clang-format off using device_normalization_f16_instances = std::tuple < - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> DeviceNormalizationImpl, // irregular size DeviceNormalizationImpl, // irregular size DeviceNormalizationImpl, // irregular size diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp index 657f5ebc5d6..188f8046a4a 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -19,7 +19,7 @@ using Pass = ck::tensor_operation::element_wise::PassThrough; template using device_layernorm_f32_instances = std::tuple< // clang-format off - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> DeviceNormalizationImpl, // irregular size DeviceNormalizationImpl, // irregular size DeviceNormalizationImpl, // irregular size diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp index eb21d4a5860..ad7a352e290 100644 --- a/profiler/include/profiler/profile_layernorm_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_impl.hpp @@ -19,7 +19,7 @@ namespace profiler { template bool profile_layernorm_impl(int do_verification, @@ -86,7 +86,7 @@ bool profile_layernorm_impl(int do_verification, using DeviceOp = ck::tensor_operation::device::DeviceNormalization; diff --git a/test/normalization/test_groupnorm_fp16.cpp b/test/normalization/test_groupnorm_fp16.cpp index 636e522dce3..60d3b13959f 100644 --- a/test/normalization/test_groupnorm_fp16.cpp +++ b/test/normalization/test_groupnorm_fp16.cpp @@ -12,11 +12,11 @@ template class TestGroupnorm : public ::testing::Test { protected: - using XDataType = std::tuple_element_t<0, Tuple>; - using GammaDataType = std::tuple_element_t<1, Tuple>; - using BetaDataType = std::tuple_element_t<2, Tuple>; - using AccDataType = std::tuple_element_t<3, Tuple>; - using YDataType = std::tuple_element_t<4, Tuple>; + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; void Run() { @@ -36,7 +36,7 @@ class TestGroupnorm : public ::testing::Test ck::profiler::profile_groupnorm_impl(true, 2, false, false, length); EXPECT_TRUE(success); } @@ -44,7 +44,7 @@ class TestGroupnorm : public ::testing::Test }; using KernelTypes = ::testing::Types< - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType> std::tuple>; TYPED_TEST_SUITE(TestGroupnorm, KernelTypes); diff --git a/test/normalization/test_groupnorm_fp32.cpp b/test/normalization/test_groupnorm_fp32.cpp index ef492664bff..3542f73a62f 100644 --- a/test/normalization/test_groupnorm_fp32.cpp +++ b/test/normalization/test_groupnorm_fp32.cpp @@ -12,11 +12,11 @@ template class TestGroupnorm : public ::testing::Test { protected: - using XDataType = std::tuple_element_t<0, Tuple>; - using GammaDataType = std::tuple_element_t<1, Tuple>; - using BetaDataType = std::tuple_element_t<2, Tuple>; - using AccDataType = std::tuple_element_t<3, Tuple>; - using YDataType = std::tuple_element_t<4, Tuple>; + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; void Run() { @@ -34,7 +34,7 @@ class TestGroupnorm : public ::testing::Test ck::profiler::profile_groupnorm_impl(true, 2, false, false, length); EXPECT_TRUE(success); } @@ -42,7 +42,7 @@ class TestGroupnorm : public ::testing::Test }; using KernelTypes = ::testing::Types< - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType> std::tuple>; TYPED_TEST_SUITE(TestGroupnorm, KernelTypes); diff --git a/test/normalization/test_layernorm2d_fp16.cpp b/test/normalization/test_layernorm2d_fp16.cpp index eeb8ec150ac..d627cbe7f11 100644 --- a/test/normalization/test_layernorm2d_fp16.cpp +++ b/test/normalization/test_layernorm2d_fp16.cpp @@ -12,11 +12,11 @@ template class TestLayernorm2d : public ::testing::Test { protected: - using XDataType = std::tuple_element_t<0, Tuple>; - using GammaDataType = std::tuple_element_t<1, Tuple>; - using BetaDataType = std::tuple_element_t<2, Tuple>; - using AccDataType = std::tuple_element_t<3, Tuple>; - using YDataType = std::tuple_element_t<4, Tuple>; + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; void Run() { @@ -29,7 +29,7 @@ class TestLayernorm2d : public ::testing::Test bool success = ck::profiler::profile_layernorm_impl(true, 2, false, false, length); EXPECT_TRUE(success); @@ -38,7 +38,7 @@ class TestLayernorm2d : public ::testing::Test }; using KernelTypes = ::testing::Types< - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType> std::tuple>; TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes); diff --git a/test/normalization/test_layernorm2d_fp32.cpp b/test/normalization/test_layernorm2d_fp32.cpp index f555b42592a..de4133aa836 100644 --- a/test/normalization/test_layernorm2d_fp32.cpp +++ b/test/normalization/test_layernorm2d_fp32.cpp @@ -12,11 +12,11 @@ template class TestLayernorm2d : public ::testing::Test { protected: - using XDataType = std::tuple_element_t<0, Tuple>; - using GammaDataType = std::tuple_element_t<1, Tuple>; - using BetaDataType = std::tuple_element_t<2, Tuple>; - using AccDataType = std::tuple_element_t<3, Tuple>; - using YDataType = std::tuple_element_t<4, Tuple>; + using XDataType = std::tuple_element_t<0, Tuple>; + using GammaDataType = std::tuple_element_t<1, Tuple>; + using BetaDataType = std::tuple_element_t<2, Tuple>; + using ComputeDataType = std::tuple_element_t<3, Tuple>; + using YDataType = std::tuple_element_t<4, Tuple>; void Run() { @@ -29,7 +29,7 @@ class TestLayernorm2d : public ::testing::Test bool success = ck::profiler::profile_layernorm_impl(true, 2, false, false, length); EXPECT_TRUE(success); @@ -38,7 +38,7 @@ class TestLayernorm2d : public ::testing::Test }; using KernelTypes = ::testing::Types< - // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType> std::tuple>; TYPED_TEST_SUITE(TestLayernorm2d, KernelTypes); From 3b6f9c167fc041571e4fbe30db2a53306d8dbe5a Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 10 Feb 2023 13:04:10 -0500 Subject: [PATCH 07/19] Remove useless code --- .../gpu/grid/gridwise_normalization_welford_variance.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index 34a142bdf38..0320385bf4a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -122,11 +122,6 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk YDataType* const __restrict__ p_y_global, const YElementwiseOperation y_elementwise_op) { - if constexpr(SweepOnce) - { - num_k_block_tile_iteration = 1; - } - auto y_global_val_buf = make_dynamic_buffer( p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); From 8745c0ca8e1894ad571ab01a06bd164364ac71d8 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 10 Feb 2023 13:07:03 -0500 Subject: [PATCH 08/19] Update naive variance kernel --- .../gridwise_normalization_naive_variance.hpp | 446 ++++++++++++------ 1 file changed, 291 insertions(+), 155 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp index 89efea4d6c3..9ce8be3e131 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp @@ -4,9 +4,8 @@ #pragma once #include "ck/utility/data_type.hpp" -#include "ck/utility/reduction_common.hpp" + #include "ck/utility/reduction_operator.hpp" -#include "ck/utility/reduction_functions_accumulate.hpp" #include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" #include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" @@ -19,8 +18,8 @@ template ; @@ -59,19 +62,23 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk static constexpr auto thread_cluster_desc = make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + using ThreadBufferLengths_M_K = Sequence; + static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}))); + make_tuple(Number{}, Number{}))); using ThreadReduceDstDesc_M = decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); - using BlockwiseSumReduce = PartitionedBlockwiseReduction; - using ThreadwiseSumReduce = ThreadwiseReduction{}; static constexpr auto I2 = Number<2>{}; - static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; - static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; + + static constexpr auto ThreadBufferNumber = Number{}; __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k, const GridDesc_M_K& gamma_grid_desc_m_k, const GridDesc_M_K& beta_grid_desc_m_k, const GridDesc_M_K& y_grid_desc_m_k, index_t num_k_block_tile_iteration, - AccDataType epsilon, + ComputeDataType epsilon, const XDataType* const __restrict__ p_x_global, const GammaDataType* const __restrict__ p_gamma_global, const BetaDataType* const __restrict__ p_beta_global, YDataType* const __restrict__ p_y_global, - const AccElementwiseOperation acc_elementwise_op) + const YElementwiseOperation y_elementwise_op) { - if constexpr(SweepOnce) - { - num_k_block_tile_iteration = 1; - } - // LDS - __shared__ AccDataType p_reduce_work_buffer[BlockSize]; - - auto y_global_val_buf = make_dynamic_buffer( - p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); + __shared__ ComputeDataType p_reduce_work_buffer[BlockSize]; auto reduce_work_buf = make_dynamic_buffer(p_reduce_work_buffer, BlockSize); - StaticBuffer - x_thread_buf; - - StaticBuffer - gamma_thread_buf; - - StaticBuffer& beta_thread_buf = gamma_thread_buf; - - StaticBuffer - y_thread_buf; - - StaticBuffer& x_square_thread_buf = y_thread_buf; + auto y_global_val_buf = make_dynamic_buffer( + p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); - StaticBuffer mean_thread_buf; - StaticBuffer + auto x_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto gamma_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto beta_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto y_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto& x_square_thread_buf = y_thread_buf; + + StaticBuffer + mean_thread_buf; + StaticBuffer mean_square_thread_buf; - StaticBuffer& var_thread_buf = - mean_square_thread_buf; - - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - mean_thread_buf(I) = reduce::Add::template GetIdentityValue(); - mean_square_thread_buf(I) = reduce::Add::template GetIdentityValue(); - }); + StaticBuffer& + var_thread_buf = mean_square_thread_buf; const index_t thread_local_id = get_thread_local_1d_id(); const index_t block_global_id = get_block_1d_id(); @@ -149,12 +169,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk const auto thread_m_cluster_id = thread_cluster_idx[I0]; const auto thread_k_cluster_id = thread_cluster_idx[I1]; - using ThreadBufferLengths_M_K = Sequence; - constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2{}([&](auto I) { + mean_thread_buf(I) = reduce::Add::template GetIdentityValue(); + mean_square_thread_buf(I) = reduce::Add::template GetIdentityValue(); + }); - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); - x_square_thread_buf(Number{}) = - x_thread_buf(Number{}) * x_thread_buf(Number{}); + // Separate sweep once and sweep twice pipeline + if constexpr(SweepOnce) + { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf(i)); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + x_square_thread_buf(i)(Number{}) = + x_thread_buf(i)(Number{}) * + x_thread_buf(i)(Number{}); + }); }); - }); - ThreadwiseSumReduce::Reduce(x_thread_buf, mean_thread_buf); - ThreadwiseSumReduce::Reduce(x_square_thread_buf, mean_square_thread_buf); + ThreadwiseSumReduce::Reduce(x_thread_buf[i], mean_thread_buf); + ThreadwiseSumReduce::Reduce(x_square_thread_buf[i], mean_square_thread_buf); - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); - - ++reducedTiles; - } while(reducedTiles < num_k_block_tile_iteration); + if constexpr(i != ThreadBufferNumber - 1) + { + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + } + }); - static_for<0, MThreadSliceSize, 1>{}([&](auto I) { - if constexpr(I > 0) - block_sync_lds(); + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); - BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I)); - mean_thread_buf(I) = mean_thread_buf(I) / reduce_length; + BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I)); + mean_thread_buf(I) = mean_thread_buf(I) / reduce_length; - block_sync_lds(); + block_sync_lds(); - BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I)); - mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length; + BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I)); + mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length; - // var(x) = E[x^2] - E[x]^2 - var_thread_buf(I) = - mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I)); - }); + // var(x) = E[x^2] - E[x]^2 + var_thread_buf(I) = + mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I)); + }); - // y = (x - E[x]) / sqrt(var[x] + epsilon) - auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(iK0)); + if constexpr(iK0 != ThreadBufferNumber - 1) + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // normalize + y_thread_buf(iK0)(Number{}) = + (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * + divisor; + + // gamma & beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) * + gamma_thread_buf(iK0)(Number{}) + + beta_thread_buf(iK0)(Number{}); + }); + }); + }); - threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf(i), + y_grid_desc_m_k, + y_global_val_buf); - reducedTiles = 0; - do + if constexpr(i != ThreadBufferNumber - 1) + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + } // end of sweep once + else { - if constexpr(!SweepOnce) + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) { - threadwise_x_load.Run(x_grid_desc_m_k, - x_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - x_thread_buf); + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + x_square_thread_buf(i)(Number{}) = + x_thread_buf(i)(Number{}) * + x_thread_buf(i)(Number{}); + }); + }); + + ThreadwiseSumReduce::Reduce(x_thread_buf[i], mean_thread_buf); + ThreadwiseSumReduce::Reduce(x_square_thread_buf[i], mean_square_thread_buf); + }); } - threadwise_gamma_load.Run(gamma_grid_desc_m_k, - gamma_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - gamma_thread_buf); + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); - - // normalize - y_thread_buf(Number{}) = - (x_thread_buf(Number{}) - mean_thread_buf(iM)) / - sqrt(var_thread_buf(iM) + epsilon); - - // gamma - y_thread_buf(Number{}) = - y_thread_buf(Number{}) * gamma_thread_buf(Number{}); - }); - }); + BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I)); + mean_thread_buf(I) = mean_thread_buf(I) / reduce_length; - threadwise_beta_load.Run(beta_grid_desc_m_k, - beta_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - beta_thread_buf); + block_sync_lds(); - static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { - static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { - constexpr auto offset_m_k = - thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I)); + mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length; - // beta - y_thread_buf(Number{}) = - y_thread_buf(Number{}) + beta_thread_buf(Number{}); - }); + // var(x) = E[x^2] - E[x]^2 + var_thread_buf(I) = + mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I)); }); - threadwise_y_store.Run(thread_buffer_desc_m_k, - make_tuple(I0, I0), - y_thread_buf, - y_grid_desc_m_k, - y_global_val_buf); + auto thread_copy_tail_m_k = + (num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k; threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_bwd_step_m_k); - threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); - ++reducedTiles; - } while(reducedTiles < num_k_block_tile_iteration); + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf(i)); + + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // normalize + y_thread_buf(iK0)(Number{}) = + (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * + divisor; + + // gamma + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) * + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + + beta_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf(i), + y_grid_desc_m_k, + y_global_val_buf); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + } + } // end of sweep twice } }; From 510dfb603e11a217fc49baf903312b092db68581 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 10 Feb 2023 14:22:49 -0500 Subject: [PATCH 09/19] Refine string --- profiler/include/profiler/profile_layernorm_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp index ad7a352e290..6c16fe4f876 100644 --- a/profiler/include/profiler/profile_layernorm_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_impl.hpp @@ -182,7 +182,7 @@ bool profile_layernorm_impl(int do_verification, y_dev.FromDevice(y.mData.data()); bool pass = ck::utils::check_err( - y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); if(do_log) { From f174fb096d9525fce3950af4284199ed549181f3 Mon Sep 17 00:00:00 2001 From: rocking Date: Fri, 10 Feb 2023 14:23:36 -0500 Subject: [PATCH 10/19] Fix typo --- example/27_layernorm/layernorm_blockwise.cpp | 6 +++--- example/42_groupnorm/groupnorm_sigmoid_fp16.cpp | 6 +++--- .../ck/tensor_operation/gpu/device/device_normalization.hpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/layernorm_blockwise.cpp index 87c20eb172d..7d91b69d047 100644 --- a/example/27_layernorm/layernorm_blockwise.cpp +++ b/example/27_layernorm/layernorm_blockwise.cpp @@ -24,7 +24,7 @@ using XDataType = ck::half_t; using GammaDataType = ck::half_t; using BetaDataType = ck::half_t; using YDataType = ck::half_t; -using ConputeDataType = float; +using ComputeDataType = float; using PassThrough = ck::tensor_operation::element_wise::PassThrough; constexpr int Rank = 2; @@ -34,7 +34,7 @@ using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationImpl; diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp index d283c656607..35c7c054e05 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -27,7 +27,7 @@ using XDataType = ck::half_t; using GammaDataType = ck::half_t; using BetaDataType = ck::half_t; using YDataType = ck::half_t; -using ConputeDataType = float; +using ComputeDataType = float; struct YElementOp { @@ -50,7 +50,7 @@ using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationImpl; ReferenceInstance ref; diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp index 4739cca3dbb..03601ce8312 100644 --- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp @@ -14,7 +14,7 @@ namespace device { template Date: Fri, 10 Feb 2023 14:25:13 -0500 Subject: [PATCH 11/19] Support naive variance for device_normalization --- .../device/impl/device_normalization_impl.hpp | 131 +++--------- .../grid/gridwise_normalization_selector.hpp | 195 ++++++++++++++++++ 2 files changed, 223 insertions(+), 103 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp index 0353051021d..c80d7879f8c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp @@ -10,46 +10,11 @@ #include "ck/tensor_operation/gpu/device/device_normalization.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" -namespace ck { -template -__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k, - const GridDesc_M_K gamma_grid_desc_m_k, - const GridDesc_M_K beta_grid_desc_m_k, - const GridDesc_M_K y_grid_desc_m_k, - index_t num_k_block_tile_iteration, - ConputeDataType epsilon, - const XDataType* const __restrict__ p_x_global, - const GammaDataType* const __restrict__ p_gamma_global, - const BetaDataType* const __restrict__ p_beta_global, - YDataType* const __restrict__ p_y_global, - const YElementwiseOperation y_elementwise_op) -{ - GridwiseReduction::Run(x_grid_desc_m_k, - gamma_grid_desc_m_k, - beta_grid_desc_m_k, - y_grid_desc_m_k, - num_k_block_tile_iteration, - epsilon, - p_x_global, - p_gamma_global, - p_beta_global, - p_y_global, - y_elementwise_op); -}; -} // namespace ck - namespace ck { namespace tensor_operation { namespace device { @@ -58,7 +23,7 @@ namespace device { template + index_t YDstVectorSize, + bool UseWelford = true> struct DeviceNormalizationImpl : public DeviceNormalization; - using GridwiseNormalizationSweepOnce = - GridwiseNormalizationWelfordVariance_mk_to_mk; - struct Argument : public BaseArgument { Argument(const std::vector lengths, @@ -232,7 +153,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization(epsilon); + epsilon_ = static_cast(epsilon); Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); xStrides_ = shuffle_tensor_dimensions(xStrides, reduceDims); @@ -265,7 +186,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization{}) <= KThreadClusterSize * KThreadSliceSize; } - ConputeDataType epsilon_; + ComputeDataType epsilon_; const XDataType* p_x_; const GammaDataType* p_gamma_; @@ -295,23 +216,27 @@ struct DeviceNormalizationImpl : public DeviceNormalization - : kernel_normalization; + auto kernel_main = NormalizationKernelSelector(arg.isSweeponce_); float avg_time = 0; avg_time += launch_and_time_kernel(stream_config, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp new file mode 100644 index 00000000000..37795fa5694 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp" + +namespace ck { +template +__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k, + const GridDesc_M_K gamma_grid_desc_m_k, + const GridDesc_M_K beta_grid_desc_m_k, + const GridDesc_M_K y_grid_desc_m_k, + index_t num_k_block_tile_iteration, + ComputeDataType epsilon, + const XDataType* const __restrict__ p_x_global, + const GammaDataType* const __restrict__ p_gamma_global, + const BetaDataType* const __restrict__ p_beta_global, + YDataType* const __restrict__ p_y_global, + const YElementwiseOperation y_elementwise_op) +{ + GridwiseReduction::Run(x_grid_desc_m_k, + gamma_grid_desc_m_k, + beta_grid_desc_m_k, + y_grid_desc_m_k, + num_k_block_tile_iteration, + epsilon, + p_x_global, + p_gamma_global, + p_beta_global, + p_y_global, + y_elementwise_op); +}; + +template +auto NormalizationKernelSelector(bool isSweepOnce) +{ + using GridwiseNormalizationGenericNaive = + GridwiseNormalizationNaiveVariance_mk_to_mk; + using GridwiseNormalizationSweepOnceNaive = + GridwiseNormalizationNaiveVariance_mk_to_mk; + using GridwiseNormalizationGenericWelford = + GridwiseNormalizationWelfordVariance_mk_to_mk; + using GridwiseNormalizationSweepOnceWelford = + GridwiseNormalizationWelfordVariance_mk_to_mk; + + if constexpr(UseWelford) + { + return isSweepOnce ? kernel_normalization + : kernel_normalization; + } + else + { + return isSweepOnce ? kernel_normalization + : kernel_normalization; + } +} + +} // namespace ck From 491c063142d43121b40490f82b266190344ca163 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 13 Feb 2023 11:46:34 -0500 Subject: [PATCH 12/19] Check the blocksize --- .../gpu/device/impl/device_normalization_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp index c80d7879f8c..bb62332d1ad 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp @@ -50,6 +50,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization { + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize); static_assert( ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) || (GammaSrcVectorDim == 1 && KThreadSliceSize % GammaSrcVectorSize == 0)), From f61c37d052dc3e79600891d386a46c11f40314a8 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 13 Feb 2023 11:48:11 -0500 Subject: [PATCH 13/19] Share the VGPR of x and y --- .../gpu/grid/gridwise_normalization_welford_variance.hpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index 0320385bf4a..8c0f4b359d6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -152,14 +152,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }, Number{}); - auto y_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); + auto& y_thread_buf = x_thread_buf; StaticBuffer mean_thread_buf; From 8b1f22383430af4d33784dccd9a3a5cc7a30d9f8 Mon Sep 17 00:00:00 2001 From: rocking Date: Mon, 13 Feb 2023 12:04:09 -0500 Subject: [PATCH 14/19] Share the VGPR of gamma and beta --- .../gridwise_normalization_naive_variance.hpp | 45 ++++++++++-------- ...ridwise_normalization_welford_variance.hpp | 46 +++++++++++-------- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp index 9ce8be3e131..792ffabcb90 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp @@ -133,14 +133,7 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk }, Number{}); - auto beta_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); + auto& beta_thread_buf = gamma_thread_buf; auto y_thread_buf = generate_tuple( [&](auto) { @@ -314,15 +307,6 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { - threadwise_beta_load.Run(beta_grid_desc_m_k, - beta_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - beta_thread_buf(iK0)); - if constexpr(iK0 != ThreadBufferNumber - 1) - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - thread_copy_fwd_step_m_k); - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); @@ -335,7 +319,32 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk // gamma & beta y_thread_buf(iK0)(Number{}) = y_thread_buf(iK0)(Number{}) * - gamma_thread_buf(iK0)(Number{}) + + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + + if constexpr(i != ThreadBufferNumber - 1) + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + beta_thread_buf(iK0)(Number{}); }); }); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index 8c0f4b359d6..ff83e105044 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -143,15 +143,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk }, Number{}); - auto beta_thread_buf = generate_tuple( - [&](auto) { - return StaticBuffer{}; - }, - Number{}); - + auto& beta_thread_buf = gamma_thread_buf; auto& y_thread_buf = x_thread_buf; StaticBuffer @@ -292,15 +284,6 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon); static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { - threadwise_beta_load.Run(beta_grid_desc_m_k, - beta_global_val_buf, - thread_buffer_desc_m_k, - make_tuple(I0, I0), - beta_thread_buf(iK0)); - if constexpr(iK0 != ThreadBufferNumber - 1) - threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, - thread_copy_fwd_step_m_k); - static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { constexpr auto offset_m_k = thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); @@ -313,7 +296,32 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk // gamma & beta y_thread_buf(iK0)(Number{}) = y_thread_buf(iK0)(Number{}) * - gamma_thread_buf(iK0)(Number{}) + + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, ThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + + if constexpr(i != ThreadBufferNumber - 1) + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + beta_thread_buf(iK0)(Number{}); }); }); From ff5b6ea1766ab4d38b9f011104053c58351bc0f3 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 14 Feb 2023 08:03:14 -0500 Subject: [PATCH 15/19] Add more instances --- .../gpu/normalization/device_normalization_f16_instance.cpp | 6 +++--- .../gpu/normalization/device_normalization_f32_instance.cpp | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp index b24ee9f9f44..7923c9a1c86 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -28,13 +28,13 @@ using device_normalization_f16_instances = DeviceNormalizationImpl, // irregular size DeviceNormalizationImpl, // irregular size DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, + DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, @@ -54,14 +54,14 @@ void add_device_normalization_rank_4_3_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f16_instances{}); + // add_device_operation_instances(instances, device_normalization_f16_instances{}); } void add_device_normalization_rank_5_3_f16_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_normalization_f16_instances{}); + // add_device_operation_instances(instances, device_normalization_f16_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp index 188f8046a4a..8b7404966c3 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -32,9 +32,11 @@ using device_layernorm_f32_instances = std::tuple< DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, + DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl, + DeviceNormalizationImpl, DeviceNormalizationImpl, DeviceNormalizationImpl // clang-format on @@ -51,14 +53,14 @@ void add_device_normalization_rank_4_3_f32_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_layernorm_f32_instances{}); + // add_device_operation_instances(instances, device_layernorm_f32_instances{}); } void add_device_normalization_rank_5_3_f32_instances( std::vector>>& instances) { - add_device_operation_instances(instances, device_layernorm_f32_instances{}); + // add_device_operation_instances(instances, device_layernorm_f32_instances{}); } } // namespace instance From 7460bab1e0df4e422b4d90fd644b9f78e0843ee7 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 14 Feb 2023 08:03:42 -0500 Subject: [PATCH 16/19] Support fp16 sqrt for experiment --- include/ck/utility/math_v2.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index 4aba0b11926..4febace0b84 100644 --- a/include/ck/utility/math_v2.hpp +++ b/include/ck/utility/math_v2.hpp @@ -83,6 +83,11 @@ static inline __host__ bool isnan(int4_t x) }; #endif +static inline __host__ half_t sqrt(half_t x) +{ + return static_cast(std::sqrt(static_cast(x))); +}; + static inline __host__ float sqrt(float x) { return std::sqrt(x); }; static inline __host__ double sqrt(double x) { return std::sqrt(x); }; @@ -158,6 +163,11 @@ static inline __device__ bool isnan(half_t x) return (xx & 0x7FFF) > 0x7C00; }; +static inline __device__ half_t sqrt(half_t x) +{ + return static_cast(__builtin_amdgcn_sqrtf(static_cast(x))); +}; + static inline __device__ float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); }; static inline __device__ double sqrt(double x) { return __builtin_amdgcn_sqrt(x); }; From aaee60c3556e623336d7a3fd0514c754ef24834f Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 14 Feb 2023 08:06:51 -0500 Subject: [PATCH 17/19] Add CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c3215ae44f..23e7fb6274b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ Full documentation for Composable Kernel is not yet available. - Fixed grouped ConvBwdWeight test case failure (#524). ### Optimizations -- Optimized ... +- Improve proformance of normalization kernel ### Added - Added user tutorial (#563). From 6df536729b578a4c07624b027222b0ac8515d4b6 Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 14 Feb 2023 08:20:07 -0500 Subject: [PATCH 18/19] Fix typo --- .../gpu/normalization/device_normalization_f16_instance.cpp | 4 ++-- .../gpu/normalization/device_normalization_f32_instance.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp index 7923c9a1c86..beeaa3aa22d 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -54,14 +54,14 @@ void add_device_normalization_rank_4_3_f16_instances( std::vector>>& instances) { - // add_device_operation_instances(instances, device_normalization_f16_instances{}); + add_device_operation_instances(instances, device_normalization_f16_instances{}); } void add_device_normalization_rank_5_3_f16_instances( std::vector>>& instances) { - // add_device_operation_instances(instances, device_normalization_f16_instances{}); + add_device_operation_instances(instances, device_normalization_f16_instances{}); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp index 8b7404966c3..4d236fb6332 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -53,14 +53,14 @@ void add_device_normalization_rank_4_3_f32_instances( std::vector>>& instances) { - // add_device_operation_instances(instances, device_layernorm_f32_instances{}); + add_device_operation_instances(instances, device_layernorm_f32_instances{}); } void add_device_normalization_rank_5_3_f32_instances( std::vector>>& instances) { - // add_device_operation_instances(instances, device_layernorm_f32_instances{}); + add_device_operation_instances(instances, device_layernorm_f32_instances{}); } } // namespace instance From 98562925a131be66da6b9b6e367685073243f61c Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 14 Feb 2023 15:08:59 -0500 Subject: [PATCH 19/19] clang-format --- .../gpu/grid/gridwise_normalization_welford_variance.hpp | 2 +- profiler/include/profiler/profile_layernorm_impl.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp index ff83e105044..3a7ae459e5f 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -144,7 +144,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk Number{}); auto& beta_thread_buf = gamma_thread_buf; - auto& y_thread_buf = x_thread_buf; + auto& y_thread_buf = x_thread_buf; StaticBuffer mean_thread_buf; diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp index 6c16fe4f876..7dd90d07977 100644 --- a/profiler/include/profiler/profile_layernorm_impl.hpp +++ b/profiler/include/profiler/profile_layernorm_impl.hpp @@ -181,8 +181,8 @@ bool profile_layernorm_impl(int do_verification, { y_dev.FromDevice(y.mData.data()); - bool pass = ck::utils::check_err( - y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + bool pass = + ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); if(do_log) {