diff --git a/example/13_pool2d_fwd/CMakeLists.txt b/example/13_pool2d_fwd/CMakeLists.txt
index 1fdeb4c5858..db09c03321e 100644
--- a/example/13_pool2d_fwd/CMakeLists.txt
+++ b/example/13_pool2d_fwd/CMakeLists.txt
@@ -1 +1,3 @@
-add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
+add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
+add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)
+
diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md
index 2314cfd6701..9b017734e92 100644
--- a/example/13_pool2d_fwd/README.md
+++ b/example/13_pool2d_fwd/README.md
@@ -1,12 +1,12 @@
-# Instructions for ```example_pool2d_fwd``` Example
+# Instructions for ```example_pool2d_fwd``` Examples
 
-## Run ```example_pool2d_fwd```
+## Run ```example_pool2d_fwd_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
 #arg3: time kernel (0=no, 1=yes)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_pool2d_fwd 1 1 1
+./bin/example_pool2d_fwd_fp16 1 1 1
 ```
 
 Result 
@@ -18,3 +18,24 @@ Warm up 1 time
 Start running 10 times...
 Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
 ```
+
+## Run ```example_pool2d_fwd_fp32```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
+./bin/example_pool2d_fwd_fp32 1 1 1
+```
+
+
+Result 
+```
+./bin/example_pool2d_fwd_fp32 1 1 1
+in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
+out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
+launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} 
+Warm up 1 time
+Start running 10 times...
+Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
+```
diff --git a/example/13_pool2d_fwd/pool2d_fwd.cpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
similarity index 76%
rename from example/13_pool2d_fwd/pool2d_fwd.cpp
rename to example/13_pool2d_fwd/pool2d_fwd_common.hpp
index 662a48500f5..8327478fd57 100644
--- a/example/13_pool2d_fwd/pool2d_fwd.cpp
+++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp
@@ -1,8 +1,4 @@
 #include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <stdlib.h>
 
 #include "check_err.hpp"
 #include "config.hpp"
@@ -13,44 +9,13 @@
 #include "host_reduce_util.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "reduction_operator.hpp"
+#include "reduction_enums.hpp"
 #include "device_pool2d_fwd_nhwc_nhwc.hpp"
 
-using InDataType  = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-
-using IndexDataType = int32_t;
-
-using InLayout  = ck::tensor_layout::convolution::NHWC;
-using OutLayout = ck::tensor_layout::convolution::NHWC;
-
-#if 1
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
-#else
-static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
-#endif
-
-static constexpr bool OutputIndex  = false;
-static constexpr bool PropagateNan = false;
-
-using DevicePoolFwdInstance =
-    ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
-        InDataType,  // InDataType
-        OutDataType, // OutDataType
-        AccDataType, // AccDataType
-        ReduceOpId,
-        OutputIndex,
-        64, // BlockSize
-        64, // ReduceMThreadClusterSize
-        1,  // ReduceKThreadClusterSize
-        4,  // ReduceMThreadSliceSize
-        1,  // ReduceKThreadSliceSize
-        4>; // InSrcOutDstVectorSize
-
 template <typename InDataType,
           typename OutDataType,
           typename AccDataType,
+          typename IndexDataType,
           ck::ReduceTensorOp ReduceOpId,
           bool PropagateNan,
           bool OutputIndex>
@@ -147,68 +112,46 @@ static void pool_host_verify(const Tensor<InDataType>& in,
     };
 }
 
-int main(int argc, char* argv[])
+template <typename InDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool pool_test(bool do_verification,
+               int init_method,
+               bool time_kernel,
+               ck::index_t N,
+               ck::index_t C,
+               ck::index_t Y,
+               ck::index_t X,
+               ck::index_t Hi,
+               ck::index_t Wi,
+               ck::index_t window_stride_h,
+               ck::index_t window_stride_w,
+               ck::index_t in_left_pad_h,
+               ck::index_t in_left_pad_w,
+               ck::index_t in_right_pad_h,
+               ck::index_t in_right_pad_w)
 {
     using namespace ck::host_reduce;
 
-    bool do_verification;
-    int init_method;
-    bool time_kernel;
-
-    // Pool shape
-    ck::index_t N               = 128;
-    ck::index_t C               = 192;
-    ck::index_t Y               = 3;
-    ck::index_t X               = 3;
-    ck::index_t Hi              = 71;
-    ck::index_t Wi              = 71;
-    ck::index_t window_stride_h = 2;
-    ck::index_t window_stride_w = 2;
-    ck::index_t in_left_pad_h   = 1;
-    ck::index_t in_left_pad_w   = 1;
-    ck::index_t in_right_pad_h  = 1;
-    ck::index_t in_right_pad_w  = 1;
-
-    if(argc == 1)
-    {
-        do_verification = true;
-        init_method     = 1;
-        time_kernel     = true;
-    }
-    else if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
-    }
-    else if(argc == 16)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
-
-        N               = std::stoi(argv[4]);
-        C               = std::stoi(argv[5]);
-        Y               = std::stoi(argv[6]);
-        X               = std::stoi(argv[7]);
-        Hi              = std::stoi(argv[8]);
-        Wi              = std::stoi(argv[9]);
-        window_stride_h = std::stoi(argv[10]);
-        window_stride_w = std::stoi(argv[11]);
-        in_left_pad_h   = std::stoi(argv[12]);
-        in_left_pad_w   = std::stoi(argv[13]);
-        in_right_pad_h  = std::stoi(argv[14]);
-        in_right_pad_w  = std::stoi(argv[15]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(0);
-    }
+    using DevicePoolFwdInstance =
+        ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
+            InDataType,  // InDataType
+            OutDataType, // OutDataType
+            AccDataType, // AccDataType
+            ReduceOpId,
+            OutputIndex,
+            64, // BlockSize
+            64, // ReduceMThreadClusterSize
+            1,  // ReduceKThreadClusterSize
+            4,  // ReduceMThreadSliceSize
+            1,  // ReduceKThreadSliceSize
+            4>; // InSrcOutDstVectorSize
 
     const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
     const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
@@ -302,6 +245,7 @@ int main(int argc, char* argv[])
         pool_host_verify<InDataType,
                          OutDataType,
                          AccDataType,
+                         IndexDataType,
                          ReduceOpId,
                          PropagateNan,
                          OutputIndex>(in_n_c_hi_wi,
@@ -325,5 +269,5 @@ int main(int argc, char* argv[])
         };
     }
 
-    return (pass ? 0 : 1);
-}
+    return (pass);
+};
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
new file mode 100644
index 00000000000..624c8ad6cdd
--- /dev/null
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
@@ -0,0 +1,116 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "config.hpp"
+#include "tensor_layout.hpp"
+#include "reduction_enums.hpp"
+
+#include "pool2d_fwd_common.hpp"
+
+using InDataType  = ck::half_t;
+using OutDataType = ck::half_t;
+using AccDataType = float;
+
+using IndexDataType = int32_t;
+
+using InLayout  = ck::tensor_layout::convolution::NHWC;
+using OutLayout = ck::tensor_layout::convolution::NHWC;
+
+#if 1
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+#else
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#endif
+
+static constexpr bool OutputIndex  = false;
+static constexpr bool PropagateNan = false;
+
+int main(int argc, char* argv[])
+{
+    using namespace ck::host_reduce;
+
+    bool do_verification;
+    int init_method;
+    bool time_kernel;
+
+    // Pool shape
+    ck::index_t N               = 128;
+    ck::index_t C               = 192;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 71;
+    ck::index_t Wi              = 71;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    if(argc == 1)
+    {
+        do_verification = true;
+        init_method     = 1;
+        time_kernel     = true;
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
+    }
+    else if(argc == 16)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
+
+        N               = std::stoi(argv[4]);
+        C               = std::stoi(argv[5]);
+        Y               = std::stoi(argv[6]);
+        X               = std::stoi(argv[7]);
+        Hi              = std::stoi(argv[8]);
+        Wi              = std::stoi(argv[9]);
+        window_stride_h = std::stoi(argv[10]);
+        window_stride_w = std::stoi(argv[11]);
+        in_left_pad_h   = std::stoi(argv[12]);
+        in_left_pad_w   = std::stoi(argv[13]);
+        in_right_pad_h  = std::stoi(argv[14]);
+        in_right_pad_w  = std::stoi(argv[15]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(0);
+    }
+
+    bool pass = pool_test<InDataType,
+                          OutDataType,
+                          AccDataType,
+                          IndexDataType,
+                          InLayout,
+                          OutLayout,
+                          ReduceOpId,
+                          PropagateNan,
+                          OutputIndex>(do_verification,
+                                       init_method,
+                                       time_kernel,
+                                       N,
+                                       C,
+                                       Y,
+                                       X,
+                                       Hi,
+                                       Wi,
+                                       window_stride_h,
+                                       window_stride_w,
+                                       in_left_pad_h,
+                                       in_left_pad_w,
+                                       in_right_pad_h,
+                                       in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
new file mode 100644
index 00000000000..d2d2ae05d10
--- /dev/null
+++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
@@ -0,0 +1,116 @@
+#include <iostream>
+#include <cstdlib>
+
+#include "config.hpp"
+#include "tensor_layout.hpp"
+#include "reduction_enums.hpp"
+
+#include "pool2d_fwd_common.hpp"
+
+using InDataType  = float;
+using OutDataType = float;
+using AccDataType = float;
+
+using IndexDataType = int32_t;
+
+using InLayout  = ck::tensor_layout::convolution::NHWC;
+using OutLayout = ck::tensor_layout::convolution::NHWC;
+
+#if 1
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
+#else
+static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
+#endif
+
+static constexpr bool OutputIndex  = false;
+static constexpr bool PropagateNan = false;
+
+int main(int argc, char* argv[])
+{
+    using namespace ck::host_reduce;
+
+    bool do_verification;
+    int init_method;
+    bool time_kernel;
+
+    // Pool shape
+    ck::index_t N               = 128;
+    ck::index_t C               = 192;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 71;
+    ck::index_t Wi              = 71;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    if(argc == 1)
+    {
+        do_verification = true;
+        init_method     = 1;
+        time_kernel     = true;
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
+    }
+    else if(argc == 16)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
+
+        N               = std::stoi(argv[4]);
+        C               = std::stoi(argv[5]);
+        Y               = std::stoi(argv[6]);
+        X               = std::stoi(argv[7]);
+        Hi              = std::stoi(argv[8]);
+        Wi              = std::stoi(argv[9]);
+        window_stride_h = std::stoi(argv[10]);
+        window_stride_w = std::stoi(argv[11]);
+        in_left_pad_h   = std::stoi(argv[12]);
+        in_left_pad_w   = std::stoi(argv[13]);
+        in_right_pad_h  = std::stoi(argv[14]);
+        in_right_pad_w  = std::stoi(argv[15]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(0);
+    }
+
+    bool pass = pool_test<InDataType,
+                          OutDataType,
+                          AccDataType,
+                          IndexDataType,
+                          InLayout,
+                          OutLayout,
+                          ReduceOpId,
+                          PropagateNan,
+                          OutputIndex>(do_verification,
+                                       init_method,
+                                       time_kernel,
+                                       N,
+                                       C,
+                                       Y,
+                                       X,
+                                       Hi,
+                                       Wi,
+                                       window_stride_h,
+                                       window_stride_w,
+                                       in_left_pad_h,
+                                       in_left_pad_w,
+                                       in_right_pad_h,
+                                       in_right_pad_w);
+
+    return (pass ? 0 : 1);
+}
diff --git a/example/21_lnorm/CMakeLists.txt b/example/21_lnorm/CMakeLists.txt
new file mode 100644
index 00000000000..1fc7378d794
--- /dev/null
+++ b/example/21_lnorm/CMakeLists.txt
@@ -0,0 +1 @@
+add_example_executable(example_lnorm_use_reduce lnorm_use_reduce.cpp)
diff --git a/example/21_lnorm/README.md b/example/21_lnorm/README.md
new file mode 100644
index 00000000000..475f575b45f
--- /dev/null
+++ b/example/21_lnorm/README.md
@@ -0,0 +1,19 @@
+# Instructions for ```example_lnorm_use_reduce```
+
+## Run ```example_lnorm_use_reduce```
+```bash
+# -D <xxx> : input 4-d tensor lengths (nhwc layout)
+# -v <x> :   verification (0=no, 1=yes)
+#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg2: time kernel (0=no, 1=yes) 
+./bin/example_lnorm_use_reduce -D 512,28,28,256 -v 1 1 1
+```
+
+Result
+```
+./bin/example_lnorm_use_reduce -D 512,28,28,256 -v 1 1 0 
+```
+root@dc-smc-18:/data/work/composable_kernel/Build3# bin/example_lnorm_use_reduce -D 512,28,28,256 -v 1 1 0
+Perf: 0 ms, inf GB/s, DeviceReduceMultiBlockAtomicAdd<256,M_C16_S1,K_C16_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> + DeviceReduceMultiBlockAtomicAdd<256,M_C16_S1,K_C16_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
+
+
diff --git a/example/21_lnorm/lnorm_use_reduce.cpp b/example/21_lnorm/lnorm_use_reduce.cpp
new file mode 100644
index 00000000000..0c7921cf67d
--- /dev/null
+++ b/example/21_lnorm/lnorm_use_reduce.cpp
@@ -0,0 +1,382 @@
+#include <iostream>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "check_err.hpp"
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_base.hpp"
+#include "device_reduce_multiblock.hpp"
+#include "host_common_util.hpp"
+
+#include "reduction_enums.hpp"
+#include "reduction_operator_mapping.hpp"
+
+using namespace ck;
+using namespace ck::tensor_operation::device;
+
+using InDataType  = ck::half_t;
+using OutDataType = float;
+using AccDataType = float;
+
+// for NHWC layer-norm calculation of mean and meansquare
+constexpr int Rank         = 4;
+constexpr int NumReduceDim = 3;
+
+constexpr bool PropagateNan = false;
+constexpr bool OutputIndex  = false;
+
+constexpr InMemoryDataOperationEnum OutMemoryDataOperation = InMemoryDataOperationEnum::AtomicAdd;
+
+using ReduceOperation_Mean = ck::reduce::Add<AccDataType>;
+using InElementwiseOperation_Mean =
+    ck::tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType, false>;
+using AccElementwiseOperation_Mean =
+    ck::tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType, true>;
+
+using ReduceOperation_Meansquare = ck::reduce::Add<AccDataType>;
+using InElementwiseOperation_Meansquare =
+    ck::tensor_operation::element_wise::UnarySquare<AccDataType, AccDataType, false>;
+using AccElementwiseOperation_Meansquare =
+    ck::tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType, true>;
+
+using DeviceReduceInstance_Mean = DeviceReduceMultiBlock<InDataType,
+                                                         AccDataType,
+                                                         OutDataType,
+                                                         Rank,
+                                                         NumReduceDim,
+                                                         ReduceOperation_Mean,
+                                                         InElementwiseOperation_Mean,
+                                                         AccElementwiseOperation_Mean,
+                                                         OutMemoryDataOperation,
+                                                         PropagateNan,
+                                                         OutputIndex,
+                                                         false, // HaveIndexInputIfOutputIndex
+                                                         256,
+                                                         16,
+                                                         16,
+                                                         1,
+                                                         1,
+                                                         1, // InSrcVectorDim
+                                                         1,
+                                                         1>;
+
+using DeviceReduceInstance_Meansquare = DeviceReduceMultiBlock<InDataType,
+                                                               AccDataType,
+                                                               OutDataType,
+                                                               Rank,
+                                                               NumReduceDim,
+                                                               ReduceOperation_Meansquare,
+                                                               InElementwiseOperation_Meansquare,
+                                                               AccElementwiseOperation_Meansquare,
+                                                               OutMemoryDataOperation,
+                                                               PropagateNan,
+                                                               OutputIndex,
+                                                               false, // HaveIndexInputIfOutputIndex
+                                                               256,
+                                                               16,
+                                                               16,
+                                                               1,
+                                                               1,
+                                                               1, // InSrcVectorDim
+                                                               1,
+                                                               1>;
+
+static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
+                                       {"verify", required_argument, nullptr, 'v'},
+                                       {"help", no_argument, nullptr, '?'},
+                                       {nullptr, 0, nullptr, 0}};
+
+class SimpleAppArgs
+{
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector<size_t> inLengths = {512, 28, 28, 256};
+    size_t n, h, w, c;
+
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    public:
+    SimpleAppArgs()
+    {
+        n = inLengths[0];
+        h = inLengths[1];
+        w = inLengths[2];
+        c = inLengths[3];
+    };
+
+    void show_usage(const char* cmd)
+    {
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
+                  << std::endl;
+        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
+                     "comparing with the host-based reduction"
+                  << std::endl;
+        std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
+                     "value, 3=decimal value)"
+                  << std::endl;
+        std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
+    };
+
+    int processArgs(int argc, char* argv[])
+    {
+        using ck::host_common::getTypeValuesFromString;
+
+        int ch;
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                if(inLengths.size() != Rank)
+                    throw std::runtime_error(
+                        "Invalid option format! The number of integers is incorrect!");
+
+                break;
+            case 'v':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                do_verification = static_cast<bool>(std::atoi(optarg));
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return (-1);
+                };
+                break;
+            default: show_usage(argv[0]); return (-1);
+            };
+        };
+
+        if(optind + 2 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
+
+        init_method = std::atoi(argv[optind++]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));
+
+        n = inLengths[0];
+        h = inLengths[1];
+        w = inLengths[2];
+        c = inLengths[3];
+
+        return (0);
+    };
+};
+
+template <typename InDataType, typename OutDataType, typename AccDataType>
+static void mean_meansquare_host(const Tensor<InDataType>& in,
+                                 Tensor<OutDataType>& mean_ref,
+                                 Tensor<OutDataType>& meansquare_ref,
+                                 size_t n,
+                                 size_t h,
+                                 size_t w,
+                                 size_t c)
+
+{
+    auto thread_reduce_func = [&](auto iN) {
+        AccDataType mean       = type_convert<AccDataType>(0.0f);
+        AccDataType meansquare = type_convert<AccDataType>(0.0f);
+
+        // compute mean, meanquare, variance, invVariance
+        for(std::size_t iH = 0; iH < h; iH++)
+        {
+            for(std::size_t iW = 0; iW < w; iW++)
+            {
+                for(std::size_t iC = 0; iC < c; iC++)
+                {
+                    AccDataType curr_value = type_convert<AccDataType>(in(iN, iH, iW, iC));
+
+                    mean += curr_value;
+                    meansquare += curr_value * curr_value;
+                };
+            }
+        };
+
+        mean       = mean / (h * w * c);
+        meansquare = meansquare / (h * w * c);
+
+        mean_ref(iN)       = mean;
+        meansquare_ref(iN) = meansquare;
+    };
+
+    std::size_t num_thread      = std::thread::hardware_concurrency();
+    std::size_t work_per_thread = (n + num_thread - 1) / num_thread;
+
+    std::vector<joinable_thread> threads(num_thread);
+
+    for(std::size_t it = 0; it < num_thread; it++)
+    {
+        std::size_t iN_begin = it * work_per_thread;
+        std::size_t iN_end   = std::min(static_cast<size_t>((it + 1) * work_per_thread), n);
+
+        auto f = [=] {
+            for(std::size_t iN = iN_begin; iN < iN_end; iN++)
+            {
+                thread_reduce_func(iN);
+            }
+        };
+
+        threads[it] = joinable_thread(f);
+    }
+};
+
+int main(int argc, char* argv[])
+{
+    // layer-norm calculates mean and meansquare by reducing [N, H, W, C] to [N]
+    const std::vector<int> reduceDims{1, 2, 3};
+    const std::vector<int> invariantDims{0};
+
+    SimpleAppArgs args;
+
+    if(argc > 1)
+    {
+        if(args.processArgs(argc, argv) < 0)
+            return (-1);
+    };
+
+    Tensor<InDataType> in(args.inLengths);
+
+    std::vector<size_t> outLengths{args.n};
+
+    Tensor<OutDataType> mean_ref(outLengths);
+    Tensor<OutDataType> mean(outLengths);
+    Tensor<OutDataType> meansquare_ref(outLengths);
+    Tensor<OutDataType> meansquare(outLengths);
+
+    auto inStrides  = in.mDesc.GetStrides();
+    auto outStrides = mean.mDesc.GetStrides();
+
+    size_t invariant_total_length = args.n;
+    size_t reduce_total_length    = args.h * args.w * args.c;
+
+    const float alpha = 1.0f;
+    const float beta  = 0.0f;
+
+    std::size_t num_thread = 1;
+
+    if(args.do_verification)
+    {
+        switch(args.init_method)
+        {
+        case 0: break;
+        case 1: in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread); break;
+        case 2: in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread); break;
+        default: in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
+        }
+    };
+
+    // these buffers are usually provided by the user application
+    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
+    DeviceMem mean_dev(sizeof(OutDataType) * mean.mDesc.GetElementSpace());
+    DeviceMem meansquare_dev(sizeof(OutDataType) * meansquare.mDesc.GetElementSpace());
+
+    in_dev.ToDevice(in.mData.data());
+
+    if(args.do_verification)
+    {
+        mean_meansquare_host<InDataType, AccDataType, OutDataType>(
+            in, mean_ref, meansquare_ref, args.n, args.h, args.w, args.c);
+    };
+
+    std::vector<ck::index_t> i_inLengths;
+    std::vector<ck::index_t> i_inStrides;
+    std::vector<ck::index_t> i_outLengths;
+    std::vector<ck::index_t> i_outStrides;
+
+    i_inLengths.assign(args.inLengths.begin(), args.inLengths.end());
+    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    i_outStrides.assign(outStrides.begin(), outStrides.end());
+
+    auto reduce_1 = DeviceReduceInstance_Mean{};
+    auto reduce_2 = DeviceReduceInstance_Meansquare{};
+
+    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(
+        i_inLengths,
+        i_inStrides,
+        i_outLengths,
+        i_outStrides,
+        reduceDims,
+        alpha,
+        beta,
+        in_dev.GetDeviceBuffer(),
+        nullptr,
+        mean_dev.GetDeviceBuffer(),
+        nullptr,
+        InElementwiseOperation_Mean{},
+        AccElementwiseOperation_Mean{static_cast<int32_t>(reduce_total_length)});
+
+    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(
+        i_inLengths,
+        i_inStrides,
+        i_outLengths,
+        i_outStrides,
+        reduceDims,
+        alpha,
+        beta,
+        in_dev.GetDeviceBuffer(),
+        nullptr,
+        meansquare_dev.GetDeviceBuffer(),
+        nullptr,
+        InElementwiseOperation_Meansquare{},
+        AccElementwiseOperation_Meansquare{static_cast<int32_t>(reduce_total_length)});
+
+    if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()) ||
+       !reduce_2.IsSupportedArgument(argument_ptr_2.get()))
+    {
+        std::cout
+            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
+            << std::endl;
+        return (-1);
+    };
+
+    std::string reduce_name_1 = reduce_1.GetTypeString();
+    std::string reduce_name_2 = reduce_2.GetTypeString();
+
+    auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();
+    auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();
+
+    float avg_time = 0.0f;
+
+    avg_time += invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, args.time_kernel});
+    avg_time += invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, args.time_kernel});
+
+    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
+                            invariant_total_length * sizeof(OutDataType);
+
+    float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+    std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name_1
+              << " + " << reduce_name_2 << std::endl;
+
+    bool pass = true;
+
+    if(args.do_verification)
+    {
+        mean_dev.FromDevice(mean.mData.data());
+        meansquare_dev.FromDevice(meansquare.mData.data());
+        pass = pass && ck::utils::check_err(mean.mData, mean_ref.mData);
+        pass = pass && ck::utils::check_err(meansquare.mData, meansquare_ref.mData);
+    };
+
+    return (pass ? 0 : 1);
+}
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index e595ca23333..8786dedc000 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -54,3 +54,4 @@ add_subdirectory(16_gemm_reduce)
 add_subdirectory(18_batched_gemm_reduce)
 add_subdirectory(19_binary_elementwise)
 add_subdirectory(20_convnd_bwd_weight_xdl)
+add_subdirectory(21_lnorm)
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index 5e81c6a469b..0ad78423fe5 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -325,7 +325,7 @@ struct DynamicBuffer
         {
             if(is_valid_element)
             {
-                atomic_add(c_style_pointer_cast<X*>(&p_data_[i]), x);
+                atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
             }
         }
     }
diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp
index e7a8db8c011..84320f7851e 100644
--- a/include/ck/utility/reduction_operator.hpp
+++ b/include/ck/utility/reduction_operator.hpp
@@ -46,6 +46,8 @@ namespace reduce {
 // operator can use the InMemoryDataOperation to finalize, or else it return false 3) operator() --
 // the first argument of the operator must be both an input & output, and the corresponding variable
 // usually stores
+// 3) operator() -- the first argument of the operator must be both an input & output, and the
+// corresponding variable usually stores
 //                  the accumulated result of many operator() calls; the second argument is only an
 //                  input. For indexable binary
 //                  operator, the second version of operator() has third argument (which is an