diff --git a/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp b/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp
index 4a8311d1b97..0e8c25fc4a3 100644
--- a/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp
+++ b/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -34,13 +34,13 @@
 template <typename T>
 __device__ T miopenAdd(T a, T b)
 {
-    return a + b;
+    return (a + b);
 }
 
 template <typename T>
 __device__ T miopenMul(T a, T b)
 {
-    return a * b;
+    return (a * b);
 }
 
 template <typename T>
@@ -97,7 +97,6 @@ extern "C" __global__ void Op1dTensorGeneric(const MIOPEN_TYPE* a,
         c_ptr += c_step;
     }
 }
-
 #endif
 
 #ifdef USE_2D_TENSOR_GENERIC
@@ -154,7 +153,199 @@ extern "C" __global__ void Op2dTensorGeneric(const MIOPEN_TYPE* a,
         gid += step;
     }
 }
+#endif
+
+#ifdef USE_2D_TENSOR_SQUASH
+extern "C" __global__ void Op2dTensorSquash(const MIOPEN_TYPE* a,
+                                            const MIOPEN_TYPE* b,
+                                            const int b_c,
+                                            const int b_nstride,
+                                            MIOPEN_TYPE* c,
+                                            const MIOPEN_TYPE alpha0,
+                                            const MIOPEN_TYPE alpha1,
+                                            const MIOPEN_TYPE beta,
+                                            const long Aoffset,
+                                            const long Boffset,
+                                            const long Coffset,
+                                            const long total_work,
+                                            const int use_apl0,
+                                            const int use_apl1,
+                                            const int use_bet)
+{
+    MIOPEN_TYPE a_dat[RD_BLCK];
+    MIOPEN_TYPE b_dat1[RD_BLCK];
+    MIOPEN_TYPE b_dat2[RD_BLCK];
+    MIOPEN_TYPE b_dat3[RD_BLCK];
+    MIOPEN_TYPE b_dat4[RD_BLCK];
+    MIOPEN_TYPE b_dat5[RD_BLCK];
+    MIOPEN_TYPE b_dat6[RD_BLCK];
+    MIOPEN_TYPE b_dat7[RD_BLCK];
+    MIOPEN_TYPE b_dat8[RD_BLCK];
+    MIOPEN_TYPE b_dat9[RD_BLCK];
+    MIOPEN_TYPE b_dat10[RD_BLCK];
+    MIOPEN_TYPE b_dat11[RD_BLCK];
+    MIOPEN_TYPE b_dat12[RD_BLCK];
+    MIOPEN_TYPE b_dat13[RD_BLCK];
+    MIOPEN_TYPE b_dat14[RD_BLCK];
+    MIOPEN_TYPE b_dat15[RD_BLCK];
+    MIOPEN_TYPE b_dat16[RD_BLCK];
+    MIOPEN_TYPE c_dat[RD_BLCK];
+    int g_RD_BLCK;
+
+    for(int i = 0; i < RD_BLCK; ++i)
+    {
+        b_dat1[i]  = (MIOPEN_TYPE)0;
+        b_dat2[i]  = (MIOPEN_TYPE)0;
+        b_dat3[i]  = (MIOPEN_TYPE)0;
+        b_dat4[i]  = (MIOPEN_TYPE)0;
+        b_dat5[i]  = (MIOPEN_TYPE)0;
+        b_dat6[i]  = (MIOPEN_TYPE)0;
+        b_dat7[i]  = (MIOPEN_TYPE)0;
+        b_dat8[i]  = (MIOPEN_TYPE)0;
+        b_dat9[i]  = (MIOPEN_TYPE)0;
+        b_dat10[i] = (MIOPEN_TYPE)0;
+        b_dat11[i] = (MIOPEN_TYPE)0;
+        b_dat12[i] = (MIOPEN_TYPE)0;
+        b_dat13[i] = (MIOPEN_TYPE)0;
+        b_dat14[i] = (MIOPEN_TYPE)0;
+        b_dat15[i] = (MIOPEN_TYPE)0;
+        b_dat16[i] = (MIOPEN_TYPE)0;
+    }
+
+    const int gid_        = blockIdx.x * blockDim.x + threadIdx.x;
+    const int global_size = gridDim.x * blockDim.x;
+    for(int gid = gid_; gid < total_work; gid += global_size)
+    {
+        for(int i = 0; i < RD_BLCK; ++i)
+        {
+            a_dat[i] = (MIOPEN_TYPE)0;
+            c_dat[i] = (MIOPEN_TYPE)0;
+        }
+
+        int io_index = gid * RD_BLCK;
+        if(use_apl0 == 1)
+        {
+            *((READ_TYPE*)a_dat) = *((const READ_TYPE*)(a + Aoffset + io_index));
+            for(int i = 0; i < RD_BLCK; ++i)
+            {
+                a_dat[i] *= alpha0;
+            }
+        }
+
+        if(use_bet == 1)
+        {
+            *((READ_TYPE*)c_dat) = *((const READ_TYPE*)(c + Coffset + io_index));
+            for(int i = 0; i < RD_BLCK; ++i)
+            {
+                c_dat[i] *= beta;
+            }
+        }
+
+        g_RD_BLCK = gid * RD_BLCK;
+        if(use_apl1 == 1)
+        {
+            for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16)
+            {
 
+                int b_index1           = (bid * b_nstride) + g_RD_BLCK;
+                int b_index2           = ((bid + 1) * b_nstride) + g_RD_BLCK;
+                int b_index3           = ((bid + 2) * b_nstride) + g_RD_BLCK;
+                int b_index4           = ((bid + 3) * b_nstride) + g_RD_BLCK;
+                int b_index5           = ((bid + 4) * b_nstride) + g_RD_BLCK;
+                int b_index6           = ((bid + 5) * b_nstride) + g_RD_BLCK;
+                int b_index7           = ((bid + 6) * b_nstride) + g_RD_BLCK;
+                int b_index8           = ((bid + 7) * b_nstride) + g_RD_BLCK;
+                int b_index9           = ((bid + 8) * b_nstride) + g_RD_BLCK;
+                int b_index10          = ((bid + 9) * b_nstride) + g_RD_BLCK;
+                int b_index11          = ((bid + 10) * b_nstride) + g_RD_BLCK;
+                int b_index12          = ((bid + 11) * b_nstride) + g_RD_BLCK;
+                int b_index13          = ((bid + 12) * b_nstride) + g_RD_BLCK;
+                int b_index14          = ((bid + 13) * b_nstride) + g_RD_BLCK;
+                int b_index15          = ((bid + 14) * b_nstride) + g_RD_BLCK;
+                int b_index16          = ((bid + 15) * b_nstride) + g_RD_BLCK;
+                *((READ_TYPE*)b_dat1)  = *((const READ_TYPE*)(b + Boffset + b_index1));
+                *((READ_TYPE*)b_dat2)  = *((const READ_TYPE*)(b + Boffset + b_index2));
+                *((READ_TYPE*)b_dat3)  = *((const READ_TYPE*)(b + Boffset + b_index3));
+                *((READ_TYPE*)b_dat4)  = *((const READ_TYPE*)(b + Boffset + b_index4));
+                *((READ_TYPE*)b_dat5)  = *((const READ_TYPE*)(b + Boffset + b_index5));
+                *((READ_TYPE*)b_dat6)  = *((const READ_TYPE*)(b + Boffset + b_index6));
+                *((READ_TYPE*)b_dat7)  = *((const READ_TYPE*)(b + Boffset + b_index7));
+                *((READ_TYPE*)b_dat8)  = *((const READ_TYPE*)(b + Boffset + b_index8));
+                *((READ_TYPE*)b_dat9)  = *((const READ_TYPE*)(b + Boffset + b_index9));
+                *((READ_TYPE*)b_dat10) = *((const READ_TYPE*)(b + Boffset + b_index10));
+                *((READ_TYPE*)b_dat11) = *((const READ_TYPE*)(b + Boffset + b_index11));
+                *((READ_TYPE*)b_dat12) = *((const READ_TYPE*)(b + Boffset + b_index12));
+                *((READ_TYPE*)b_dat13) = *((const READ_TYPE*)(b + Boffset + b_index13));
+                *((READ_TYPE*)b_dat14) = *((const READ_TYPE*)(b + Boffset + b_index14));
+                *((READ_TYPE*)b_dat15) = *((const READ_TYPE*)(b + Boffset + b_index15));
+                *((READ_TYPE*)b_dat16) = *((const READ_TYPE*)(b + Boffset + b_index16));
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat2[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat3[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat4[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat5[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat6[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat7[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat8[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat9[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat10[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat11[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat12[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat13[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat14[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat15[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat16[i] * alpha1);
+                }
+            }
+            for(int bid = ((b_c / 16) * 16); bid < b_c; bid++)
+            {
+                int b_index           = bid * b_nstride + g_RD_BLCK;
+                *((READ_TYPE*)b_dat1) = *((const READ_TYPE*)(b + Boffset + b_index));
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1);
+                }
+            }
+        }
+        else
+        {
+            for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16)
+            {
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                }
+            }
+            for(int bid = ((b_c / 16) * 16); bid < b_c; bid++)
+            {
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0);
+                }
+            }
+        }
+        *((READ_TYPE*)(c + Coffset + io_index)) = *((READ_TYPE*)c_dat);
+    }
+}
 #endif
 
 #ifdef USE_3D_TENSOR_GENERIC
@@ -227,7 +418,6 @@ extern "C" __global__ void Op3dTensorGeneric(const MIOPEN_TYPE* a,
         gid += step;
     }
 }
-
 #endif
 
 #ifdef USE_4D_TENSOR_GENERIC
@@ -342,7 +532,6 @@ extern "C" __global__ void Op4dTensorGeneric(MIOPEN_TYPE* a,
         }
     }
 }
-
 #endif
 
 #ifdef USE_4D_TENSOR_LITE
@@ -358,7 +547,7 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a,
                                           const long total_work,
                                           const int use_beta)
 {
-    int gid0 = blockIdx.x * blockDim.x + threadIdx.x;
+    int gid0        = blockIdx.x * blockDim.x + threadIdx.x;
     int global_size = gridDim.x * blockDim.x;
 
     MIOPEN_TYPE a_dat[RD_BLCK];
@@ -379,11 +568,14 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a,
                 c_dat[i] = static_cast<MIOPEN_TYPE>(0);
             }
 
-            *(reinterpret_cast<READ_TYPE*>(a_dat)) = *(reinterpret_cast<const READ_TYPE*>(a + index + Aoffset));
-            *(reinterpret_cast<READ_TYPE*>(b_dat)) = *(reinterpret_cast<const READ_TYPE*>(b + index + Boffset));
+            *(reinterpret_cast<READ_TYPE*>(a_dat)) =
+                *(reinterpret_cast<const READ_TYPE*>(a + index + Aoffset));
+            *(reinterpret_cast<READ_TYPE*>(b_dat)) =
+                *(reinterpret_cast<const READ_TYPE*>(b + index + Boffset));
             if(use_beta == 1)
             {
-                *(reinterpret_cast<READ_TYPE*>(c_dat)) = *(reinterpret_cast<const READ_TYPE*>(c + index + Coffset));
+                *(reinterpret_cast<READ_TYPE*>(c_dat)) =
+                    *(reinterpret_cast<const READ_TYPE*>(c + index + Coffset));
             }
 
             for(int i = 0; i < RD_BLCK; ++i)
@@ -395,7 +587,8 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a,
                 c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1);
             }
 
-            *(reinterpret_cast<READ_TYPE*>(c + index + Coffset)) = *(reinterpret_cast<READ_TYPE*>(c_dat));
+            *(reinterpret_cast<READ_TYPE*>(c + index + Coffset)) =
+                *(reinterpret_cast<READ_TYPE*>(c_dat));
         }
     }
     else
@@ -409,11 +602,14 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a,
                 c_dat[i] = (MIOPEN_TYPE)0;
             }
 
-            *(reinterpret_cast<READ_TYPE*>(a_dat)) = *(reinterpret_cast<const READ_TYPE*>(a + index + Aoffset));
-            *(reinterpret_cast<READ_TYPE*>(b_dat)) = *(reinterpret_cast<const READ_TYPE*>(b + index + Boffset));
+            *(reinterpret_cast<READ_TYPE*>(a_dat)) =
+                *(reinterpret_cast<const READ_TYPE*>(a + index + Aoffset));
+            *(reinterpret_cast<READ_TYPE*>(b_dat)) =
+                *(reinterpret_cast<const READ_TYPE*>(b + index + Boffset));
             if(use_beta == 1)
             {
-                *(reinterpret_cast<READ_TYPE*>(c_dat)) = *(reinterpret_cast<const READ_TYPE*>(c + index + Coffset));
+                *(reinterpret_cast<READ_TYPE*>(c_dat)) =
+                    *(reinterpret_cast<const READ_TYPE*>(c + index + Coffset));
             }
 
             for(int i = 0; i < RD_BLCK; ++i)
@@ -425,8 +621,121 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a,
                 c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1);
             }
 
-            *(reinterpret_cast<READ_TYPE*>(c + index + Coffset)) = *(reinterpret_cast<READ_TYPE*>(c_dat));
+            *(reinterpret_cast<READ_TYPE*>(c + index + Coffset)) =
+                *(reinterpret_cast<READ_TYPE*>(c_dat));
+        }
+    }
+}
+#endif // USE_4D_TENSOR_LITE
+
+#ifdef USE_2D_TENSOR_LITE
+extern "C" __global__ void Op2dTensorLite(const MIOPEN_TYPE* a,
+                                          const int a_nstride,
+                                          const MIOPEN_TYPE* b,
+                                          const int b_nstride,
+                                          MIOPEN_TYPE* c,
+                                          const int c_nstride,
+                                          const MIOPEN_TYPE alpha0,
+                                          const MIOPEN_TYPE alpha1,
+                                          const MIOPEN_TYPE beta,
+                                          const uint64_t Aoffset,
+                                          const uint64_t Boffset,
+                                          const uint64_t Coffset,
+                                          const long total_work,
+                                          const long total_work2,
+                                          const int use_beta,
+                                          const int use_bias)
+{
+    int gid0 = blockIdx.x * blockDim.x + threadIdx.x;
+    int gid1 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    MIOPEN_TYPE a_dat[RD_BLCK];
+    MIOPEN_TYPE b_dat[RD_BLCK];
+    MIOPEN_TYPE c_dat[RD_BLCK];
+
+    if(gid0 < total_work)
+    {
+        if(use_bias == 1)
+        {
+            int b_index          = gid0 * RD_BLCK;
+            *((READ_TYPE*)b_dat) = *((const READ_TYPE*)(b + Boffset + b_index));
+        }
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wfloat-equal"
+        if(beta == static_cast<MIOPEN_TYPE>(0))
+#pragma clang diagnostic pop
+        {
+            for(; gid1 < total_work2; gid1 += gridDim.y * blockDim.y)
+            {
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] = (MIOPEN_TYPE)0;
+                }
+
+                int a_index = gid1 * a_nstride + gid0 * RD_BLCK;
+                int c_index = gid1 * c_nstride + gid0 * RD_BLCK;
+
+                *((READ_TYPE*)a_dat) = *((const READ_TYPE*)(a + Aoffset + a_index));
+                if(use_beta == 1)
+                {
+                    *((READ_TYPE*)c_dat) = *((const READ_TYPE*)(c + Coffset + c_index));
+                }
+
+                if(use_bias == 0)
+                {
+                    int b_index          = gid1 * b_nstride + gid0 * RD_BLCK;
+                    *((READ_TYPE*)b_dat) = *((const READ_TYPE*)(b + Boffset + b_index));
+                }
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    if(use_beta == 1)
+                    {
+                        c_dat[i] = (MIOPEN_TYPE)0;
+                    }
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1);
+                }
+
+                *((READ_TYPE*)(c + Coffset + c_index)) = *((READ_TYPE*)c_dat);
+            }
+        }
+        else
+        {
+            for(; gid1 < total_work2; gid1 += gridDim.y * blockDim.y)
+            {
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] = (MIOPEN_TYPE)0;
+                }
+
+                int a_index = gid1 * a_nstride + gid0 * RD_BLCK;
+                int c_index = gid1 * c_nstride + gid0 * RD_BLCK;
+
+                *((READ_TYPE*)a_dat) = *((const READ_TYPE*)(a + Aoffset + a_index));
+                if(use_beta == 1)
+                {
+                    *((READ_TYPE*)c_dat) = *((const READ_TYPE*)(c + Coffset + c_index));
+                }
+
+                if(use_bias == 0)
+                {
+                    int b_index          = gid1 * b_nstride + gid0 * RD_BLCK;
+                    *((READ_TYPE*)b_dat) = *((const READ_TYPE*)(b + Boffset + b_index));
+                }
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    if(use_beta == 1)
+                    {
+                        c_dat[i] *= beta;
+                    }
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1);
+                }
+
+                *((READ_TYPE*)(c + Coffset + c_index)) = *((READ_TYPE*)c_dat);
+            }
         }
     }
 }
-#endif // USE_4D_TENSOR_LITE
\ No newline at end of file
+#endif
diff --git a/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp b/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp
index 263e9a6bc1e..6e629b97b57 100644
--- a/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp
+++ b/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp
@@ -57,7 +57,7 @@ void BatchNormFusedInferencGPU(const miopen::Handle& handle,
                                ConstData_t estimatedMean,
                                ConstData_t estimatedVariance,
                                double epsilon,
-                               PerfHelper<float>& perf_helper,
+                               PerfHelper& perf_helper,
                                bool use_hip)
 {
     int n, c, h, w;
@@ -352,8 +352,7 @@ struct BatchNormInferFusedTest
     const float activ_beta  = static_cast<float>(0.5f);
     const float activ_gamma = static_cast<float>(0.5f);
     double epsilon          = 1.0e-5;
-    // GetKernelTime returns time in float
-    PerfHelper<float> perf_helper;
+    PerfHelper perf_helper;
 };
 
 template <typename XDataType,
diff --git a/projects/miopen/test/gtest/perf_helper.hpp b/projects/miopen/test/gtest/perf_helper.hpp
index 00fef46fbe6..4789c8ef68a 100644
--- a/projects/miopen/test/gtest/perf_helper.hpp
+++ b/projects/miopen/test/gtest/perf_helper.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -29,36 +29,32 @@
 #define NUM_PERF_RUNS 5
 #define NUM_WARMUP_RUNS 3
 
-template <typename T>
 struct PerfHelper
 {
-    std::vector<std::tuple<std::string, T, T, double, double, double>> kernelTestStats;
+    std::vector<std::tuple<std::string, double, double, double, double, double>> kernelTestStats;
 
-    // hold the min, max, mean, median, and standard deviation
-    std::tuple<T, T, double, double, double> gpuStats;
-
-    static T perf_min(const std::vector<T>& data)
+    static double perf_min(const std::vector<double>& data)
     {
         if(data.empty())
             throw std::invalid_argument("Empty vector");
         return *std::min_element(data.begin(), data.end());
     }
 
-    static T perf_max(const std::vector<T>& data)
+    static double perf_max(const std::vector<double>& data)
     {
         if(data.empty())
             throw std::invalid_argument("Empty vector");
         return *std::max_element(data.begin(), data.end());
     }
 
-    static double perf_mean(const std::vector<T>& data)
+    static double perf_mean(const std::vector<double>& data)
     {
         if(data.empty())
             throw std::invalid_argument("Empty vector");
         return std::accumulate(data.begin(), data.end(), 0.0) / data.size();
     }
 
-    static double perf_median(std::vector<T> data)
+    static double perf_median(std::vector<double> data)
     {
         if(data.empty())
             throw std::invalid_argument("Empty vector");
@@ -74,26 +70,29 @@ struct PerfHelper
         }
     }
 
-    static double perf_standardDeviation(const std::vector<T>& data)
+    static double perf_standardDeviation(const std::vector<double>& data)
     {
         if(data.empty())
             throw std::invalid_argument("Empty vector");
         double data_mean = perf_mean(data);
         double sq_sum    = std::inner_product(
-            data.begin(), data.end(), data.begin(), 0.0, std::plus<>(), [data_mean](T a, T b) {
-                return (a - data_mean) * (b - data_mean);
-            });
+            data.begin(),
+            data.end(),
+            data.begin(),
+            0.0,
+            std::plus<>(),
+            [data_mean](double a, double b) { return (a - data_mean) * (b - data_mean); });
         return std::sqrt(sq_sum / data.size());
     }
 
-    static std::tuple<T, T, double, double, double> calcStats(const std::vector<T>& data)
+    static auto calcStats(const std::vector<double>& data)
     {
-        T min_val         = perf_min(data);
-        T max_val         = perf_max(data);
+        double min_val    = perf_min(data);
+        double max_val    = perf_max(data);
         double mean_val   = perf_mean(data);
         double median_val = perf_median(data); // Note: This modifies the data(sorts it)
         double sd_val     = perf_standardDeviation(data);
-        return {min_val, max_val, mean_val, median_val, sd_val};
+        return std::make_tuple(min_val, max_val, mean_val, median_val, sd_val);
     }
 
     void writeStatsToCSV(const std::string& filename, std::string test_info)
@@ -154,25 +153,17 @@ struct PerfHelper
     void perfTest(const miopen::Handle& handle,
                   const std::string& kernel_name,
                   const std::string& network_config,
-                  bool append,
                   Args&&... args)
     {
+        miopen::AutoEnableProfiling autoProfiling(handle);
+
         // Get kernels matching the kernel_name and network_config from the cache
         auto&& kernels = handle.GetKernels(kernel_name, network_config);
         // Ensure we have at least one kernel
         assert(!kernels.empty());
-        // Vector to hold the execution times
-        std::vector<T> elapsedTime_ms;
 
-        if(handle.IsProfilingEnabled())
-        { // If profiling was enabled elsewhere, reset the kernel time
-            handle.ResetKernelTime();
-        }
-        else
-        {
-            handle.EnableProfiling(); // Enable profiling
-            handle.ResetKernelTime(); // for good measure?
-        }
+        // Vector to hold the execution times
+        std::vector<double> elapsedTime_ms;
         // Optionally ignore the first few runs to allow for warm-up
         for(size_t i = 0; i < NUM_PERF_RUNS + NUM_WARMUP_RUNS; i++)
         {
@@ -184,9 +175,8 @@ struct PerfHelper
             handle.ResetKernelTime();
         }
 
-        handle.EnableProfiling(false); // Disable profiling
-
-        gpuStats = calcStats(elapsedTime_ms);
+        // Calculate the min, max, mean, median, and standard deviation
+        auto gpuStats = calcStats(elapsedTime_ms);
         kernelTestStats.push_back({kernel_name,
                                    std::get<0>(gpuStats),
                                    std::get<1>(gpuStats),
diff --git a/projects/miopen/test/gtest/tensor_2d_lite_ocl_hip.cpp b/projects/miopen/test/gtest/tensor_2d_lite_ocl_hip.cpp
new file mode 100644
index 00000000000..c71cdfdf4d7
--- /dev/null
+++ b/projects/miopen/test/gtest/tensor_2d_lite_ocl_hip.cpp
@@ -0,0 +1,346 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <miopen/datatype.hpp>
+#include <gtest/gtest.h>
+
+#include "get_handle.hpp"
+#include "verify.hpp"
+
+#define PERF_ENABLE 1
+#if PERF_ENABLE
+#include "perf_helper.hpp"
+#endif
+
+struct TensorsConfig
+{
+    std::vector<int> aclens;
+    std::vector<int> acstrides;
+    std::vector<int> blens;
+    std::vector<int> bstrides;
+};
+
+template <typename T>
+std::vector<TensorsConfig> TensorsConfigs()
+{
+    std::vector<TensorsConfig> configs;
+#define MiB (1024ul * 1024ul)
+
+#if PERF_ENABLE
+    for(int N = (1 * MiB); N <= (1024 * MiB); N *= 2)
+    {
+        configs.push_back({{1, 1, N}, {N, N, 1}, {1, 1, N}, {N, N, 1}});
+    }
+    return configs;
+#else
+    int C = 4;
+    int N = 20 * 1024;
+    configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}});
+    C = 1;
+    N = 64 * MiB;
+    configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}});
+    N = 256 * MiB;
+    configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}});
+    N = 1024 * MiB;
+    configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}});
+    return configs;
+#endif
+}
+
+template <typename T>
+struct Op2dTensorLiteTest
+    : public ::testing::TestWithParam<std::tuple<TensorsConfig, float, float, float>>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle                                 = get_handle();
+        std::tie(tensorsConfig, alpha0, alpha1, beta) = GetParam();
+
+        data_type = miopen_type<T>{};
+
+        // Generate elements in tensors
+        tensA = tensor<T>{tensorsConfig.aclens, tensorsConfig.acstrides}.generate(
+            tensor_elem_gen_integer{17});
+        tensB = tensor<T>{tensorsConfig.blens, tensorsConfig.bstrides}.generate(
+            tensor_elem_gen_integer{17});
+        tensC = tensor<T>{tensorsConfig.aclens, tensorsConfig.acstrides}.generate(
+            [](auto...) { return 1; });
+
+        // Write the device tensors
+        tensA_dev = handle.Write(tensA.data);
+        tensB_dev = handle.Write(tensB.data);
+
+        // Allocate output tensors for OCL and HIP
+        tensC_ocl = tensor<T>{tensorsConfig.aclens, tensorsConfig.acstrides};
+        tensC_hip = tensor<T>{tensorsConfig.aclens, tensorsConfig.acstrides};
+
+        // Prepare all parameters needed for kernel
+        auto first_not_one = std::find_if(
+            tensorsConfig.blens.rbegin(), tensorsConfig.blens.rend(), [](int i) { return i != 1; });
+        auto d = std::distance(tensorsConfig.blens.begin(), first_not_one.base());
+
+        int num_wg = first_not_one != tensorsConfig.blens.rend()
+                         ? static_cast<int>(*first_not_one == 0 ? 1 : *first_not_one)
+                         : 1;
+        for(int i = (d - 2); i >= 0; i--)
+        {
+            if(tensorsConfig.blens[i] != 1)
+            {
+                num_wg *= tensorsConfig.blens[i];
+            }
+        }
+
+        long max_num_wg = 4096;
+        num_wg          = num_wg > max_num_wg ? max_num_wg : num_wg;
+
+        auto len     = tensorsConfig.aclens[2];
+        auto RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1;
+
+        const std::string MIOPEN_TYPE = miopen::GetDataType(data_type);
+        const std::string READ_TYPE =
+            (RD_BLCK == 1) ? MIOPEN_TYPE : MIOPEN_TYPE + std::to_string(RD_BLCK);
+
+        params = " -DMIOPEN_TYPE=" + MIOPEN_TYPE + " -DREAD_TYPE=" + READ_TYPE +
+                 " -DRD_BLCK=" + std::to_string(RD_BLCK);
+        params += " -DMIOPEN_TENSOR_OP=miopenAdd -DUSE_2D_TENSOR_LITE";
+
+        total_work         = std::max(len / RD_BLCK, 1);
+        long local_threads = 256;
+        long grp_sz        = (total_work + local_threads - 1) / local_threads;
+        grp_sz             = std::min(max_num_wg, grp_sz);
+        long glb_sz        = local_threads * grp_sz;
+
+        total_work2         = tensorsConfig.aclens[1];
+        long local_threads2 = 64;
+        long grp_sz2        = (total_work2 + local_threads2 - 1) / local_threads2;
+        grp_sz2             = std::min((max_num_wg / grp_sz), grp_sz2);
+        long glb_sz2        = local_threads2 * grp_sz2;
+
+        vld = {local_threads, 1, 1};
+        vgd = {glb_sz, glb_sz2, 1};
+
+        network_config += std::to_string(data_type) + "-miopenTensorOpAdd-";
+
+        use_beta = !miopen::float_equal(beta, 0);
+        use_bias = (tensorsConfig.blens[1] == 1);
+    }
+
+    void runOCL() // run OCL kernel
+    {
+        auto&& handle = get_handle();
+        // Write data to device tensor
+        tensC_dev = handle.Write(tensC.data);
+
+        std::string paramsOCL =
+            params + " " + miopen::GetDataTypeKBP(data_type).GenerateFor(miopen::kbp::OpenCL{});
+
+        std::string program_name       = "MIOpenTensorKernels.cl";
+        std::string network_config_ocl = network_config + "-ocl";
+
+        handle.AddKernel("Op2dTensorLite",
+                         network_config_ocl,
+                         program_name,
+                         "Op2dTensorLite",
+                         vld,
+                         vgd,
+                         paramsOCL)(tensA_dev.get(),
+                                    tensorsConfig.acstrides[0],
+                                    tensB_dev.get(),
+                                    tensorsConfig.bstrides[0],
+                                    tensC_dev.get(),
+                                    tensorsConfig.acstrides[0],
+                                    alpha0,
+                                    alpha1,
+                                    beta,
+                                    uint64_t(0),
+                                    uint64_t(0),
+                                    uint64_t(0),
+                                    total_work,
+                                    total_work2,
+                                    use_beta,
+                                    use_bias);
+
+        tensC_ocl.data = handle.Read<T>(tensC_dev, tensC_ocl.data.size());
+
+#if PERF_ENABLE
+        ph.perfTest(handle,
+                    "Op2dTensorLite",
+                    network_config_ocl,
+                    tensA_dev.get(),
+                    tensorsConfig.acstrides[0],
+                    tensB_dev.get(),
+                    tensorsConfig.bstrides[0],
+                    tensC_dev.get(),
+                    tensorsConfig.acstrides[0],
+                    alpha0,
+                    alpha1,
+                    beta,
+                    uint64_t(0),
+                    uint64_t(0),
+                    uint64_t(0),
+                    total_work,
+                    total_work2,
+                    use_beta,
+                    use_bias);
+#endif
+    }
+
+    void runHIP() // run HIP kernel
+    {
+        auto&& handle = get_handle();
+        tensC_dev     = handle.Write(tensC.data);
+
+        std::string paramsHIP =
+            params + " " + miopen::GetDataTypeKBP(data_type).GenerateFor(miopen::kbp::HIP{});
+
+        std::string program_name       = "MIOpenTensorKernelsHip.cpp";
+        std::string network_config_hip = network_config + "-hip";
+
+        handle.AddKernel("Op2dTensorLite",
+                         network_config_hip,
+                         program_name,
+                         "Op2dTensorLite",
+                         vld,
+                         vgd,
+                         paramsHIP)(tensA_dev.get(),
+                                    tensorsConfig.acstrides[0],
+                                    tensB_dev.get(),
+                                    tensorsConfig.bstrides[0],
+                                    tensC_dev.get(),
+                                    tensorsConfig.acstrides[0],
+                                    alpha0,
+                                    alpha1,
+                                    beta,
+                                    uint64_t(0),
+                                    uint64_t(0),
+                                    uint64_t(0),
+                                    total_work,
+                                    total_work2,
+                                    use_beta,
+                                    use_bias);
+
+        tensC_hip.data = handle.Read<T>(tensC_dev, tensC_hip.data.size());
+
+#if PERF_ENABLE
+        ph.perfTest(handle,
+                    "Op2dTensorLite",
+                    network_config_hip,
+                    tensA_dev.get(),
+                    tensorsConfig.acstrides[0],
+                    tensB_dev.get(),
+                    tensorsConfig.bstrides[0],
+                    tensC_dev.get(),
+                    tensorsConfig.acstrides[0],
+                    alpha0,
+                    alpha1,
+                    beta,
+                    uint64_t(0),
+                    uint64_t(0),
+                    uint64_t(0),
+                    total_work,
+                    total_work2,
+                    use_beta,
+                    use_bias);
+#endif
+    }
+
+    void verify()
+    {
+        auto error = miopen::rms_range(tensC_ocl, tensC_hip);
+        EXPECT_TRUE(error == 0) << "GPU outputs do not match each other. Error: " << error;
+    }
+
+    void TearDown() override
+    {
+#if PERF_ENABLE
+        std::string stats{};
+        stats += "_aclens_" + std::to_string(tensorsConfig.aclens[0]) + "_" +
+                 std::to_string(tensorsConfig.aclens[1]) + "_" +
+                 std::to_string(tensorsConfig.aclens[2]) + "_" +
+                 std::to_string(tensorsConfig.aclens[3]) + "_acstrides_" +
+                 std::to_string(tensorsConfig.acstrides[0]) + "_" +
+                 std::to_string(tensorsConfig.acstrides[1]) + "_" +
+                 std::to_string(tensorsConfig.acstrides[2]) + "_" +
+                 std::to_string(tensorsConfig.acstrides[3]);
+        stats += "_blens_" + std::to_string(tensorsConfig.blens[0]) + "_" +
+                 std::to_string(tensorsConfig.blens[1]) + "_" +
+                 std::to_string(tensorsConfig.blens[2]) + "_" +
+                 std::to_string(tensorsConfig.blens[3]) + "_bstrides_" +
+                 std::to_string(tensorsConfig.bstrides[0]) + "_" +
+                 std::to_string(tensorsConfig.bstrides[1]) + "_" +
+                 std::to_string(tensorsConfig.bstrides[2]) + "_" +
+                 std::to_string(tensorsConfig.bstrides[3]);
+        stats += "_alpha0_" + std::to_string(alpha0) + "_alpha1_" + std::to_string(alpha1) +
+                 "_beta_" + std::to_string(beta) + "_" + miopen::GetDataType(data_type);
+
+        ph.writeStatsToCSV("tensor_2d_lite.csv", stats);
+#endif
+    }
+
+    std::string network_config{};
+    std::string params{};
+    std::vector<size_t> vld, vgd;
+
+    tensor<T> tensA;
+    tensor<T> tensB;
+    tensor<T> tensC;
+    tensor<T> tensC_ocl;
+    tensor<T> tensC_hip;
+
+    miopenDataType_t data_type;
+
+    miopen::Allocator::ManageDataPtr tensA_dev;
+    miopen::Allocator::ManageDataPtr tensB_dev;
+    miopen::Allocator::ManageDataPtr tensC_dev;
+
+    TensorsConfig tensorsConfig;
+    T alpha0, alpha1, beta;
+    long total_work, total_work2;
+    int use_beta, use_bias;
+
+#if PERF_ENABLE
+    PerfHelper ph;
+#endif
+};
+
+using GPU_Op2dTensorLiteTest_FP32 = Op2dTensorLiteTest<float>;
+
+TEST_P(GPU_Op2dTensorLiteTest_FP32, PortTest)
+{
+    // run OCL kernel
+    runOCL();
+    // run HIP kernel
+    runHIP();
+    // verify if the output tensors are same
+    verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Op2dTensorLiteTest_FP32,
+                         testing::Combine(testing::ValuesIn(TensorsConfigs<float>()),
+                                          testing::Values(1.0f),
+                                          testing::Values(1.0f),
+                                          testing::Values(0.0f, 1.0f)));
diff --git a/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp b/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp
index 1183cf41352..3675a68289c 100644
--- a/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp
+++ b/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp
@@ -23,21 +23,16 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include "get_handle.hpp"
-#include "random.hpp"
-#include <verify.hpp>
-#include <miopen/miopen.h>
 #include <miopen/datatype.hpp>
-#include <miopen/kernel_build_params.hpp>
 #include <gtest/gtest.h>
 
-#include <tensor_util.hpp>
+#include "get_handle.hpp"
+#include "verify.hpp"
 
 #include "perf_helper.hpp"
-#include <miopen/float_equal.hpp>
 
 #define MAX_TENSOR_ELEM 17
-#define PERF_ENABLE 0
+#define PERF_ENABLE 1
 #define POW_2 1
 
 struct TensorsConfig
@@ -89,31 +84,30 @@ std::vector<TensorsConfig> TensorsConfigs()
 
     if constexpr(PERF_ENABLE)
     {
-
         const auto& handle = get_handle();
+        auto deviceName = handle.GetDeviceName();
         size_t maxTotalSize;
-
         // Generate all NCHW tensors that are limited by L3 cache size
         // or 2xL2 cache size when L3 is not available
-        if(miopen::StartsWith(handle.GetDeviceName(), "gfx90a") ||
-           miopen::StartsWith(handle.GetDeviceName(), "gfx908"))
+        if(miopen::StartsWith(deviceName, "gfx90a") ||
+           miopen::StartsWith(deviceName, "gfx908"))
         {
             maxTotalSize = 16; // twice the 8MB L2
         }
-        else if(miopen::StartsWith(handle.GetDeviceName(), "gfx803"))
+        else if(miopen::StartsWith(deviceName, "gfx803"))
         {
             maxTotalSize = 4; // twice the 2MB L2
         }
-        else if(miopen::StartsWith(handle.GetDeviceName(), "gfx900") ||
-                miopen::StartsWith(handle.GetDeviceName(), "gfx906"))
+        else if(miopen::StartsWith(deviceName, "gfx900") ||
+                miopen::StartsWith(deviceName, "gfx906"))
         {
             maxTotalSize = 8; // twice the 4MB L2
         }
-        else if(miopen::StartsWith(handle.GetDeviceName(), "gfx942"))
+        else if(miopen::StartsWith(deviceName, "gfx942"))
         {
             maxTotalSize = 256; // 256MB L3
         }
-        else if(miopen::StartsWith(handle.GetDeviceName(), "gfx103"))
+        else if(miopen::StartsWith(deviceName, "gfx103"))
         {
             maxTotalSize = 128; // 128MB L3
         }
@@ -121,7 +115,6 @@ std::vector<TensorsConfig> TensorsConfigs()
         {
             maxTotalSize = 4; // twice the 2MB L2, default case.
         }
-
         maxTotalSize = maxTotalSize * 1024ull * 1024ull / sizeof(T);
 
         if constexpr(POW_2)
@@ -246,7 +239,6 @@ struct Op4DTensorGenericTest
                                       std::multiplies<int>());
 
         bitmap = 0;
-
         bitmap |= (1 << (tensorsConfig.blens.size() - d));
 
         for(int i = (d - 2); i >= 0; i--)
@@ -262,17 +254,9 @@ struct Op4DTensorGenericTest
             }
         }
 
-        // if(std::accumulate(tensorsConfig.blens.begin(),
-        //                    tensorsConfig.blens.end(),
-        //                    4,
-        //                    std::multiplies<std::size_t>()) == 1)
-        // {
-        //     bitmap = 4;
-        // }
-
         num_wg_orig = num_wg;
-        max_num_wg  = 4096;
-        num_wg      = num_wg > max_num_wg ? max_num_wg : num_wg;
+        max_num_wg = 4096;
+        num_wg = num_wg > max_num_wg ? max_num_wg : num_wg;
 
         size_t local_threads = 256;
 
@@ -289,7 +273,6 @@ struct Op4DTensorGenericTest
 
     void runCPU()
     {
-
         std::vector<T> A = tensA.data;
         std::vector<T> B = tensB.data;
         std::vector<T> C = tensC.data;
@@ -374,7 +357,7 @@ struct Op4DTensorGenericTest
                                  static_cast<int64_t>(0),
                                  static_cast<int64_t>(0),
                                  static_cast<int64_t>(0),
-                                 static_cast<int>(num_wg_orig));
+                                 num_wg_orig);
 
         tensC_ocl.data = handle.Read<T>(tensC_dev, tensC_ocl.data.size());
 
@@ -383,7 +366,6 @@ struct Op4DTensorGenericTest
             ph.perfTest(handle,
                         "Op4dTensorGeneric",
                         network_config_ocl,
-                        false,
                         tensA_dev.get(),
                         static_cast<int>(tensorsConfig.acstrides[0]),
                         static_cast<int>(tensorsConfig.acstrides[1]),
@@ -410,7 +392,7 @@ struct Op4DTensorGenericTest
                         static_cast<int64_t>(0),
                         static_cast<int64_t>(0),
                         static_cast<int64_t>(0),
-                        static_cast<int>(num_wg_orig));
+                        num_wg_orig);
         }
     }
 
@@ -461,7 +443,7 @@ struct Op4DTensorGenericTest
                                  static_cast<int64_t>(0),
                                  static_cast<int64_t>(0),
                                  static_cast<int64_t>(0),
-                                 static_cast<int>(num_wg_orig));
+                                 num_wg_orig);
 
         tensC_hip.data = handle.Read<T>(tensC_dev, tensC_hip.data.size());
 
@@ -470,7 +452,6 @@ struct Op4DTensorGenericTest
             ph.perfTest(handle,
                         "Op4dTensorGeneric",
                         network_config_hip,
-                        false,
                         tensA_dev.get(),
                         static_cast<int>(tensorsConfig.acstrides[0]),
                         static_cast<int>(tensorsConfig.acstrides[1]),
@@ -497,7 +478,7 @@ struct Op4DTensorGenericTest
                         static_cast<int64_t>(0),
                         static_cast<int64_t>(0),
                         static_cast<int64_t>(0),
-                        static_cast<int>(num_wg_orig));
+                        num_wg_orig);
         }
     }
 
@@ -510,7 +491,7 @@ struct Op4DTensorGenericTest
     void verifyCPU()
     {
         auto error = miopen::rms_range(tensC_hip, tensC_cpu);
-        EXPECT_TRUE(error == 0) << "GPU outputs do not match each other. Error: " << error;
+        EXPECT_TRUE(error == 0) << "GPU outputs do not match CPU results. Error: " << error;
     }
 
     void TearDown() override
@@ -537,7 +518,7 @@ struct Op4DTensorGenericTest
             stats += "_alpha0_" + std::to_string(alpha0) + "_alpha1_" + std::to_string(alpha1) +
                      "_beta_" + std::to_string(beta) + "_" + miopen::GetDataType(data_type);
 
-            ph.writeStatsToCSV("tensor_4d.csv", stats);
+            ph.writeStatsToCSV("tensor_4d_generic.csv", stats);
         }
     }
 
@@ -545,9 +526,7 @@ struct Op4DTensorGenericTest
     std::string params{};
     std::vector<size_t> vld, vgd;
     unsigned int bitmap;
-    int work_per_wg;
-    int num_wg_orig;
-    int max_num_wg;
+    int work_per_wg, num_wg_orig, max_num_wg;
 
     tensor<T> tensA;
     tensor<T> tensB;
@@ -565,7 +544,7 @@ struct Op4DTensorGenericTest
     TensorsConfig tensorsConfig;
     float alpha0, alpha1, beta;
 
-    PerfHelper<T> ph;
+    PerfHelper ph;
 };
 
 struct GPU_Op4dTensorGenericTest_FP32 : Op4DTensorGenericTest<float>
@@ -574,12 +553,12 @@ struct GPU_Op4dTensorGenericTest_FP32 : Op4DTensorGenericTest<float>
 
 TEST_P(GPU_Op4dTensorGenericTest_FP32, PortTest)
 {
+    // run CPU implementation
+//    runCPU();
     // run OCL kernel
     runOCL();
     // run HIP kernel
     runHIP();
-    // run CPU implementation
-    // runCPU();
     // verify if the output tensors are same
     verify();
 }