diff --git a/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp b/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp index 4a8311d1b97..0e8c25fc4a3 100644 --- a/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp +++ b/projects/miopen/src/kernels/MIOpenTensorKernelsHip.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2025 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,13 +34,13 @@ template __device__ T miopenAdd(T a, T b) { - return a + b; + return (a + b); } template __device__ T miopenMul(T a, T b) { - return a * b; + return (a * b); } template @@ -97,7 +97,6 @@ extern "C" __global__ void Op1dTensorGeneric(const MIOPEN_TYPE* a, c_ptr += c_step; } } - #endif #ifdef USE_2D_TENSOR_GENERIC @@ -154,7 +153,199 @@ extern "C" __global__ void Op2dTensorGeneric(const MIOPEN_TYPE* a, gid += step; } } +#endif + +#ifdef USE_2D_TENSOR_SQUASH +extern "C" __global__ void Op2dTensorSquash(const MIOPEN_TYPE* a, + const MIOPEN_TYPE* b, + const int b_c, + const int b_nstride, + MIOPEN_TYPE* c, + const MIOPEN_TYPE alpha0, + const MIOPEN_TYPE alpha1, + const MIOPEN_TYPE beta, + const long Aoffset, + const long Boffset, + const long Coffset, + const long total_work, + const int use_apl0, + const int use_apl1, + const int use_bet) +{ + MIOPEN_TYPE a_dat[RD_BLCK]; + MIOPEN_TYPE b_dat1[RD_BLCK]; + MIOPEN_TYPE b_dat2[RD_BLCK]; + MIOPEN_TYPE b_dat3[RD_BLCK]; + MIOPEN_TYPE b_dat4[RD_BLCK]; + MIOPEN_TYPE b_dat5[RD_BLCK]; + MIOPEN_TYPE b_dat6[RD_BLCK]; + MIOPEN_TYPE b_dat7[RD_BLCK]; + MIOPEN_TYPE b_dat8[RD_BLCK]; + MIOPEN_TYPE b_dat9[RD_BLCK]; + MIOPEN_TYPE b_dat10[RD_BLCK]; + MIOPEN_TYPE b_dat11[RD_BLCK]; + MIOPEN_TYPE b_dat12[RD_BLCK]; + MIOPEN_TYPE b_dat13[RD_BLCK]; + MIOPEN_TYPE b_dat14[RD_BLCK]; + MIOPEN_TYPE b_dat15[RD_BLCK]; + MIOPEN_TYPE b_dat16[RD_BLCK]; + MIOPEN_TYPE c_dat[RD_BLCK]; + int g_RD_BLCK; + + for(int i = 0; i < RD_BLCK; ++i) + { + b_dat1[i] = (MIOPEN_TYPE)0; + b_dat2[i] = (MIOPEN_TYPE)0; + b_dat3[i] = (MIOPEN_TYPE)0; + b_dat4[i] = (MIOPEN_TYPE)0; + b_dat5[i] = (MIOPEN_TYPE)0; + b_dat6[i] = (MIOPEN_TYPE)0; + b_dat7[i] = (MIOPEN_TYPE)0; + b_dat8[i] = (MIOPEN_TYPE)0; + b_dat9[i] = (MIOPEN_TYPE)0; + b_dat10[i] = (MIOPEN_TYPE)0; + b_dat11[i] = (MIOPEN_TYPE)0; + b_dat12[i] = (MIOPEN_TYPE)0; + b_dat13[i] = (MIOPEN_TYPE)0; + b_dat14[i] = (MIOPEN_TYPE)0; + b_dat15[i] = (MIOPEN_TYPE)0; + b_dat16[i] = (MIOPEN_TYPE)0; + } + + const int gid_ = blockIdx.x * blockDim.x + threadIdx.x; + const int global_size = gridDim.x * blockDim.x; + for(int gid = gid_; gid < total_work; gid += global_size) + { + for(int i = 0; i < RD_BLCK; ++i) + { + a_dat[i] = (MIOPEN_TYPE)0; + c_dat[i] = (MIOPEN_TYPE)0; + } + + int io_index = gid * RD_BLCK; + if(use_apl0 == 1) + { + *((READ_TYPE*)a_dat) = *((const READ_TYPE*)(a + Aoffset + io_index)); + for(int i = 0; i < RD_BLCK; ++i) + { + a_dat[i] *= alpha0; + } + } + + if(use_bet == 1) + { + *((READ_TYPE*)c_dat) = *((const READ_TYPE*)(c + Coffset + io_index)); + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] *= beta; + } + } + + g_RD_BLCK = gid * RD_BLCK; + if(use_apl1 == 1) + { + for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16) + { + int b_index1 = (bid * b_nstride) + g_RD_BLCK; + int b_index2 = ((bid + 1) * b_nstride) + g_RD_BLCK; + int b_index3 = ((bid + 2) * b_nstride) + g_RD_BLCK; + int b_index4 = ((bid + 3) * b_nstride) + g_RD_BLCK; + int b_index5 = ((bid + 4) * b_nstride) + g_RD_BLCK; + int b_index6 = ((bid + 5) * b_nstride) + g_RD_BLCK; + int b_index7 = ((bid + 6) * b_nstride) + g_RD_BLCK; + int b_index8 = ((bid + 7) * b_nstride) + g_RD_BLCK; + int b_index9 = ((bid + 8) * b_nstride) + g_RD_BLCK; + int b_index10 = ((bid + 9) * b_nstride) + g_RD_BLCK; + int b_index11 = ((bid + 10) * b_nstride) + g_RD_BLCK; + int b_index12 = ((bid + 11) * b_nstride) + g_RD_BLCK; + int b_index13 = ((bid + 12) * b_nstride) + g_RD_BLCK; + int b_index14 = ((bid + 13) * b_nstride) + g_RD_BLCK; + int b_index15 = ((bid + 14) * b_nstride) + g_RD_BLCK; + int b_index16 = ((bid + 15) * b_nstride) + g_RD_BLCK; + *((READ_TYPE*)b_dat1) = *((const READ_TYPE*)(b + Boffset + b_index1)); + *((READ_TYPE*)b_dat2) = *((const READ_TYPE*)(b + Boffset + b_index2)); + *((READ_TYPE*)b_dat3) = *((const READ_TYPE*)(b + Boffset + b_index3)); + *((READ_TYPE*)b_dat4) = *((const READ_TYPE*)(b + Boffset + b_index4)); + *((READ_TYPE*)b_dat5) = *((const READ_TYPE*)(b + Boffset + b_index5)); + *((READ_TYPE*)b_dat6) = *((const READ_TYPE*)(b + Boffset + b_index6)); + *((READ_TYPE*)b_dat7) = *((const READ_TYPE*)(b + Boffset + b_index7)); + *((READ_TYPE*)b_dat8) = *((const READ_TYPE*)(b + Boffset + b_index8)); + *((READ_TYPE*)b_dat9) = *((const READ_TYPE*)(b + Boffset + b_index9)); + *((READ_TYPE*)b_dat10) = *((const READ_TYPE*)(b + Boffset + b_index10)); + *((READ_TYPE*)b_dat11) = *((const READ_TYPE*)(b + Boffset + b_index11)); + *((READ_TYPE*)b_dat12) = *((const READ_TYPE*)(b + Boffset + b_index12)); + *((READ_TYPE*)b_dat13) = *((const READ_TYPE*)(b + Boffset + b_index13)); + *((READ_TYPE*)b_dat14) = *((const READ_TYPE*)(b + Boffset + b_index14)); + *((READ_TYPE*)b_dat15) = *((const READ_TYPE*)(b + Boffset + b_index15)); + *((READ_TYPE*)b_dat16) = *((const READ_TYPE*)(b + Boffset + b_index16)); + + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat2[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat3[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat4[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat5[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat6[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat7[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat8[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat9[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat10[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat11[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat12[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat13[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat14[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat15[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat16[i] * alpha1); + } + } + for(int bid = ((b_c / 16) * 16); bid < b_c; bid++) + { + int b_index = bid * b_nstride + g_RD_BLCK; + *((READ_TYPE*)b_dat1) = *((const READ_TYPE*)(b + Boffset + b_index)); + + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1); + } + } + } + else + { + for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16) + { + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + } + } + for(int bid = ((b_c / 16) * 16); bid < b_c; bid++) + { + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], (MIOPEN_TYPE)0); + } + } + } + *((READ_TYPE*)(c + Coffset + io_index)) = *((READ_TYPE*)c_dat); + } +} #endif #ifdef USE_3D_TENSOR_GENERIC @@ -227,7 +418,6 @@ extern "C" __global__ void Op3dTensorGeneric(const MIOPEN_TYPE* a, gid += step; } } - #endif #ifdef USE_4D_TENSOR_GENERIC @@ -342,7 +532,6 @@ extern "C" __global__ void Op4dTensorGeneric(MIOPEN_TYPE* a, } } } - #endif #ifdef USE_4D_TENSOR_LITE @@ -358,7 +547,7 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a, const long total_work, const int use_beta) { - int gid0 = blockIdx.x * blockDim.x + threadIdx.x; + int gid0 = blockIdx.x * blockDim.x + threadIdx.x; int global_size = gridDim.x * blockDim.x; MIOPEN_TYPE a_dat[RD_BLCK]; @@ -379,11 +568,14 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a, c_dat[i] = static_cast(0); } - *(reinterpret_cast(a_dat)) = *(reinterpret_cast(a + index + Aoffset)); - *(reinterpret_cast(b_dat)) = *(reinterpret_cast(b + index + Boffset)); + *(reinterpret_cast(a_dat)) = + *(reinterpret_cast(a + index + Aoffset)); + *(reinterpret_cast(b_dat)) = + *(reinterpret_cast(b + index + Boffset)); if(use_beta == 1) { - *(reinterpret_cast(c_dat)) = *(reinterpret_cast(c + index + Coffset)); + *(reinterpret_cast(c_dat)) = + *(reinterpret_cast(c + index + Coffset)); } for(int i = 0; i < RD_BLCK; ++i) @@ -395,7 +587,8 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a, c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1); } - *(reinterpret_cast(c + index + Coffset)) = *(reinterpret_cast(c_dat)); + *(reinterpret_cast(c + index + Coffset)) = + *(reinterpret_cast(c_dat)); } } else @@ -409,11 +602,14 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a, c_dat[i] = (MIOPEN_TYPE)0; } - *(reinterpret_cast(a_dat)) = *(reinterpret_cast(a + index + Aoffset)); - *(reinterpret_cast(b_dat)) = *(reinterpret_cast(b + index + Boffset)); + *(reinterpret_cast(a_dat)) = + *(reinterpret_cast(a + index + Aoffset)); + *(reinterpret_cast(b_dat)) = + *(reinterpret_cast(b + index + Boffset)); if(use_beta == 1) { - *(reinterpret_cast(c_dat)) = *(reinterpret_cast(c + index + Coffset)); + *(reinterpret_cast(c_dat)) = + *(reinterpret_cast(c + index + Coffset)); } for(int i = 0; i < RD_BLCK; ++i) @@ -425,8 +621,121 @@ extern "C" __global__ void Op4dTensorLite(const MIOPEN_TYPE* a, c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1); } - *(reinterpret_cast(c + index + Coffset)) = *(reinterpret_cast(c_dat)); + *(reinterpret_cast(c + index + Coffset)) = + *(reinterpret_cast(c_dat)); + } + } +} +#endif // USE_4D_TENSOR_LITE + +#ifdef USE_2D_TENSOR_LITE +extern "C" __global__ void Op2dTensorLite(const MIOPEN_TYPE* a, + const int a_nstride, + const MIOPEN_TYPE* b, + const int b_nstride, + MIOPEN_TYPE* c, + const int c_nstride, + const MIOPEN_TYPE alpha0, + const MIOPEN_TYPE alpha1, + const MIOPEN_TYPE beta, + const uint64_t Aoffset, + const uint64_t Boffset, + const uint64_t Coffset, + const long total_work, + const long total_work2, + const int use_beta, + const int use_bias) +{ + int gid0 = blockIdx.x * blockDim.x + threadIdx.x; + int gid1 = blockIdx.y * blockDim.y + threadIdx.y; + + MIOPEN_TYPE a_dat[RD_BLCK]; + MIOPEN_TYPE b_dat[RD_BLCK]; + MIOPEN_TYPE c_dat[RD_BLCK]; + + if(gid0 < total_work) + { + if(use_bias == 1) + { + int b_index = gid0 * RD_BLCK; + *((READ_TYPE*)b_dat) = *((const READ_TYPE*)(b + Boffset + b_index)); + } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" + if(beta == static_cast(0)) +#pragma clang diagnostic pop + { + for(; gid1 < total_work2; gid1 += gridDim.y * blockDim.y) + { + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] = (MIOPEN_TYPE)0; + } + + int a_index = gid1 * a_nstride + gid0 * RD_BLCK; + int c_index = gid1 * c_nstride + gid0 * RD_BLCK; + + *((READ_TYPE*)a_dat) = *((const READ_TYPE*)(a + Aoffset + a_index)); + if(use_beta == 1) + { + *((READ_TYPE*)c_dat) = *((const READ_TYPE*)(c + Coffset + c_index)); + } + + if(use_bias == 0) + { + int b_index = gid1 * b_nstride + gid0 * RD_BLCK; + *((READ_TYPE*)b_dat) = *((const READ_TYPE*)(b + Boffset + b_index)); + } + + for(int i = 0; i < RD_BLCK; ++i) + { + if(use_beta == 1) + { + c_dat[i] = (MIOPEN_TYPE)0; + } + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1); + } + + *((READ_TYPE*)(c + Coffset + c_index)) = *((READ_TYPE*)c_dat); + } + } + else + { + for(; gid1 < total_work2; gid1 += gridDim.y * blockDim.y) + { + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] = (MIOPEN_TYPE)0; + } + + int a_index = gid1 * a_nstride + gid0 * RD_BLCK; + int c_index = gid1 * c_nstride + gid0 * RD_BLCK; + + *((READ_TYPE*)a_dat) = *((const READ_TYPE*)(a + Aoffset + a_index)); + if(use_beta == 1) + { + *((READ_TYPE*)c_dat) = *((const READ_TYPE*)(c + Coffset + c_index)); + } + + if(use_bias == 0) + { + int b_index = gid1 * b_nstride + gid0 * RD_BLCK; + *((READ_TYPE*)b_dat) = *((const READ_TYPE*)(b + Boffset + b_index)); + } + + for(int i = 0; i < RD_BLCK; ++i) + { + if(use_beta == 1) + { + c_dat[i] *= beta; + } + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i] * alpha0, b_dat[i] * alpha1); + } + + *((READ_TYPE*)(c + Coffset + c_index)) = *((READ_TYPE*)c_dat); + } } } } -#endif // USE_4D_TENSOR_LITE \ No newline at end of file +#endif diff --git a/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp b/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp index 263e9a6bc1e..6e629b97b57 100644 --- a/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp +++ b/projects/miopen/test/gtest/bn_infer_fused_ocl_hip.cpp @@ -57,7 +57,7 @@ void BatchNormFusedInferencGPU(const miopen::Handle& handle, ConstData_t estimatedMean, ConstData_t estimatedVariance, double epsilon, - PerfHelper& perf_helper, + PerfHelper& perf_helper, bool use_hip) { int n, c, h, w; @@ -352,8 +352,7 @@ struct BatchNormInferFusedTest const float activ_beta = static_cast(0.5f); const float activ_gamma = static_cast(0.5f); double epsilon = 1.0e-5; - // GetKernelTime returns time in float - PerfHelper perf_helper; + PerfHelper perf_helper; }; template struct PerfHelper { - std::vector> kernelTestStats; + std::vector> kernelTestStats; - // hold the min, max, mean, median, and standard deviation - std::tuple gpuStats; - - static T perf_min(const std::vector& data) + static double perf_min(const std::vector& data) { if(data.empty()) throw std::invalid_argument("Empty vector"); return *std::min_element(data.begin(), data.end()); } - static T perf_max(const std::vector& data) + static double perf_max(const std::vector& data) { if(data.empty()) throw std::invalid_argument("Empty vector"); return *std::max_element(data.begin(), data.end()); } - static double perf_mean(const std::vector& data) + static double perf_mean(const std::vector& data) { if(data.empty()) throw std::invalid_argument("Empty vector"); return std::accumulate(data.begin(), data.end(), 0.0) / data.size(); } - static double perf_median(std::vector data) + static double perf_median(std::vector data) { if(data.empty()) throw std::invalid_argument("Empty vector"); @@ -74,26 +70,29 @@ struct PerfHelper } } - static double perf_standardDeviation(const std::vector& data) + static double perf_standardDeviation(const std::vector& data) { if(data.empty()) throw std::invalid_argument("Empty vector"); double data_mean = perf_mean(data); double sq_sum = std::inner_product( - data.begin(), data.end(), data.begin(), 0.0, std::plus<>(), [data_mean](T a, T b) { - return (a - data_mean) * (b - data_mean); - }); + data.begin(), + data.end(), + data.begin(), + 0.0, + std::plus<>(), + [data_mean](double a, double b) { return (a - data_mean) * (b - data_mean); }); return std::sqrt(sq_sum / data.size()); } - static std::tuple calcStats(const std::vector& data) + static auto calcStats(const std::vector& data) { - T min_val = perf_min(data); - T max_val = perf_max(data); + double min_val = perf_min(data); + double max_val = perf_max(data); double mean_val = perf_mean(data); double median_val = perf_median(data); // Note: This modifies the data(sorts it) double sd_val = perf_standardDeviation(data); - return {min_val, max_val, mean_val, median_val, sd_val}; + return std::make_tuple(min_val, max_val, mean_val, median_val, sd_val); } void writeStatsToCSV(const std::string& filename, std::string test_info) @@ -154,25 +153,17 @@ struct PerfHelper void perfTest(const miopen::Handle& handle, const std::string& kernel_name, const std::string& network_config, - bool append, Args&&... args) { + miopen::AutoEnableProfiling autoProfiling(handle); + // Get kernels matching the kernel_name and network_config from the cache auto&& kernels = handle.GetKernels(kernel_name, network_config); // Ensure we have at least one kernel assert(!kernels.empty()); - // Vector to hold the execution times - std::vector elapsedTime_ms; - if(handle.IsProfilingEnabled()) - { // If profiling was enabled elsewhere, reset the kernel time - handle.ResetKernelTime(); - } - else - { - handle.EnableProfiling(); // Enable profiling - handle.ResetKernelTime(); // for good measure? - } + // Vector to hold the execution times + std::vector elapsedTime_ms; // Optionally ignore the first few runs to allow for warm-up for(size_t i = 0; i < NUM_PERF_RUNS + NUM_WARMUP_RUNS; i++) { @@ -184,9 +175,8 @@ struct PerfHelper handle.ResetKernelTime(); } - handle.EnableProfiling(false); // Disable profiling - - gpuStats = calcStats(elapsedTime_ms); + // Calculate the min, max, mean, median, and standard deviation + auto gpuStats = calcStats(elapsedTime_ms); kernelTestStats.push_back({kernel_name, std::get<0>(gpuStats), std::get<1>(gpuStats), diff --git a/projects/miopen/test/gtest/tensor_2d_lite_ocl_hip.cpp b/projects/miopen/test/gtest/tensor_2d_lite_ocl_hip.cpp new file mode 100644 index 00000000000..c71cdfdf4d7 --- /dev/null +++ b/projects/miopen/test/gtest/tensor_2d_lite_ocl_hip.cpp @@ -0,0 +1,346 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include + +#include "get_handle.hpp" +#include "verify.hpp" + +#define PERF_ENABLE 1 +#if PERF_ENABLE +#include "perf_helper.hpp" +#endif + +struct TensorsConfig +{ + std::vector aclens; + std::vector acstrides; + std::vector blens; + std::vector bstrides; +}; + +template +std::vector TensorsConfigs() +{ + std::vector configs; +#define MiB (1024ul * 1024ul) + +#if PERF_ENABLE + for(int N = (1 * MiB); N <= (1024 * MiB); N *= 2) + { + configs.push_back({{1, 1, N}, {N, N, 1}, {1, 1, N}, {N, N, 1}}); + } + return configs; +#else + int C = 4; + int N = 20 * 1024; + configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}}); + C = 1; + N = 64 * MiB; + configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}}); + N = 256 * MiB; + configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}}); + N = 1024 * MiB; + configs.push_back({{1, C, N}, {N * C, N, 1}, {1, C, N}, {N * C, N, 1}}); + return configs; +#endif +} + +template +struct Op2dTensorLiteTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + std::tie(tensorsConfig, alpha0, alpha1, beta) = GetParam(); + + data_type = miopen_type{}; + + // Generate elements in tensors + tensA = tensor{tensorsConfig.aclens, tensorsConfig.acstrides}.generate( + tensor_elem_gen_integer{17}); + tensB = tensor{tensorsConfig.blens, tensorsConfig.bstrides}.generate( + tensor_elem_gen_integer{17}); + tensC = tensor{tensorsConfig.aclens, tensorsConfig.acstrides}.generate( + [](auto...) { return 1; }); + + // Write the device tensors + tensA_dev = handle.Write(tensA.data); + tensB_dev = handle.Write(tensB.data); + + // Allocate output tensors for OCL and HIP + tensC_ocl = tensor{tensorsConfig.aclens, tensorsConfig.acstrides}; + tensC_hip = tensor{tensorsConfig.aclens, tensorsConfig.acstrides}; + + // Prepare all parameters needed for kernel + auto first_not_one = std::find_if( + tensorsConfig.blens.rbegin(), tensorsConfig.blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(tensorsConfig.blens.begin(), first_not_one.base()); + + int num_wg = first_not_one != tensorsConfig.blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; + for(int i = (d - 2); i >= 0; i--) + { + if(tensorsConfig.blens[i] != 1) + { + num_wg *= tensorsConfig.blens[i]; + } + } + + long max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + + auto len = tensorsConfig.aclens[2]; + auto RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1; + + const std::string MIOPEN_TYPE = miopen::GetDataType(data_type); + const std::string READ_TYPE = + (RD_BLCK == 1) ? MIOPEN_TYPE : MIOPEN_TYPE + std::to_string(RD_BLCK); + + params = " -DMIOPEN_TYPE=" + MIOPEN_TYPE + " -DREAD_TYPE=" + READ_TYPE + + " -DRD_BLCK=" + std::to_string(RD_BLCK); + params += " -DMIOPEN_TENSOR_OP=miopenAdd -DUSE_2D_TENSOR_LITE"; + + total_work = std::max(len / RD_BLCK, 1); + long local_threads = 256; + long grp_sz = (total_work + local_threads - 1) / local_threads; + grp_sz = std::min(max_num_wg, grp_sz); + long glb_sz = local_threads * grp_sz; + + total_work2 = tensorsConfig.aclens[1]; + long local_threads2 = 64; + long grp_sz2 = (total_work2 + local_threads2 - 1) / local_threads2; + grp_sz2 = std::min((max_num_wg / grp_sz), grp_sz2); + long glb_sz2 = local_threads2 * grp_sz2; + + vld = {local_threads, 1, 1}; + vgd = {glb_sz, glb_sz2, 1}; + + network_config += std::to_string(data_type) + "-miopenTensorOpAdd-"; + + use_beta = !miopen::float_equal(beta, 0); + use_bias = (tensorsConfig.blens[1] == 1); + } + + void runOCL() // run OCL kernel + { + auto&& handle = get_handle(); + // Write data to device tensor + tensC_dev = handle.Write(tensC.data); + + std::string paramsOCL = + params + " " + miopen::GetDataTypeKBP(data_type).GenerateFor(miopen::kbp::OpenCL{}); + + std::string program_name = "MIOpenTensorKernels.cl"; + std::string network_config_ocl = network_config + "-ocl"; + + handle.AddKernel("Op2dTensorLite", + network_config_ocl, + program_name, + "Op2dTensorLite", + vld, + vgd, + paramsOCL)(tensA_dev.get(), + tensorsConfig.acstrides[0], + tensB_dev.get(), + tensorsConfig.bstrides[0], + tensC_dev.get(), + tensorsConfig.acstrides[0], + alpha0, + alpha1, + beta, + uint64_t(0), + uint64_t(0), + uint64_t(0), + total_work, + total_work2, + use_beta, + use_bias); + + tensC_ocl.data = handle.Read(tensC_dev, tensC_ocl.data.size()); + +#if PERF_ENABLE + ph.perfTest(handle, + "Op2dTensorLite", + network_config_ocl, + tensA_dev.get(), + tensorsConfig.acstrides[0], + tensB_dev.get(), + tensorsConfig.bstrides[0], + tensC_dev.get(), + tensorsConfig.acstrides[0], + alpha0, + alpha1, + beta, + uint64_t(0), + uint64_t(0), + uint64_t(0), + total_work, + total_work2, + use_beta, + use_bias); +#endif + } + + void runHIP() // run HIP kernel + { + auto&& handle = get_handle(); + tensC_dev = handle.Write(tensC.data); + + std::string paramsHIP = + params + " " + miopen::GetDataTypeKBP(data_type).GenerateFor(miopen::kbp::HIP{}); + + std::string program_name = "MIOpenTensorKernelsHip.cpp"; + std::string network_config_hip = network_config + "-hip"; + + handle.AddKernel("Op2dTensorLite", + network_config_hip, + program_name, + "Op2dTensorLite", + vld, + vgd, + paramsHIP)(tensA_dev.get(), + tensorsConfig.acstrides[0], + tensB_dev.get(), + tensorsConfig.bstrides[0], + tensC_dev.get(), + tensorsConfig.acstrides[0], + alpha0, + alpha1, + beta, + uint64_t(0), + uint64_t(0), + uint64_t(0), + total_work, + total_work2, + use_beta, + use_bias); + + tensC_hip.data = handle.Read(tensC_dev, tensC_hip.data.size()); + +#if PERF_ENABLE + ph.perfTest(handle, + "Op2dTensorLite", + network_config_hip, + tensA_dev.get(), + tensorsConfig.acstrides[0], + tensB_dev.get(), + tensorsConfig.bstrides[0], + tensC_dev.get(), + tensorsConfig.acstrides[0], + alpha0, + alpha1, + beta, + uint64_t(0), + uint64_t(0), + uint64_t(0), + total_work, + total_work2, + use_beta, + use_bias); +#endif + } + + void verify() + { + auto error = miopen::rms_range(tensC_ocl, tensC_hip); + EXPECT_TRUE(error == 0) << "GPU outputs do not match each other. Error: " << error; + } + + void TearDown() override + { +#if PERF_ENABLE + std::string stats{}; + stats += "_aclens_" + std::to_string(tensorsConfig.aclens[0]) + "_" + + std::to_string(tensorsConfig.aclens[1]) + "_" + + std::to_string(tensorsConfig.aclens[2]) + "_" + + std::to_string(tensorsConfig.aclens[3]) + "_acstrides_" + + std::to_string(tensorsConfig.acstrides[0]) + "_" + + std::to_string(tensorsConfig.acstrides[1]) + "_" + + std::to_string(tensorsConfig.acstrides[2]) + "_" + + std::to_string(tensorsConfig.acstrides[3]); + stats += "_blens_" + std::to_string(tensorsConfig.blens[0]) + "_" + + std::to_string(tensorsConfig.blens[1]) + "_" + + std::to_string(tensorsConfig.blens[2]) + "_" + + std::to_string(tensorsConfig.blens[3]) + "_bstrides_" + + std::to_string(tensorsConfig.bstrides[0]) + "_" + + std::to_string(tensorsConfig.bstrides[1]) + "_" + + std::to_string(tensorsConfig.bstrides[2]) + "_" + + std::to_string(tensorsConfig.bstrides[3]); + stats += "_alpha0_" + std::to_string(alpha0) + "_alpha1_" + std::to_string(alpha1) + + "_beta_" + std::to_string(beta) + "_" + miopen::GetDataType(data_type); + + ph.writeStatsToCSV("tensor_2d_lite.csv", stats); +#endif + } + + std::string network_config{}; + std::string params{}; + std::vector vld, vgd; + + tensor tensA; + tensor tensB; + tensor tensC; + tensor tensC_ocl; + tensor tensC_hip; + + miopenDataType_t data_type; + + miopen::Allocator::ManageDataPtr tensA_dev; + miopen::Allocator::ManageDataPtr tensB_dev; + miopen::Allocator::ManageDataPtr tensC_dev; + + TensorsConfig tensorsConfig; + T alpha0, alpha1, beta; + long total_work, total_work2; + int use_beta, use_bias; + +#if PERF_ENABLE + PerfHelper ph; +#endif +}; + +using GPU_Op2dTensorLiteTest_FP32 = Op2dTensorLiteTest; + +TEST_P(GPU_Op2dTensorLiteTest_FP32, PortTest) +{ + // run OCL kernel + runOCL(); + // run HIP kernel + runHIP(); + // verify if the output tensors are same + verify(); +} + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Op2dTensorLiteTest_FP32, + testing::Combine(testing::ValuesIn(TensorsConfigs()), + testing::Values(1.0f), + testing::Values(1.0f), + testing::Values(0.0f, 1.0f))); diff --git a/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp b/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp index 1183cf41352..3675a68289c 100644 --- a/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp +++ b/projects/miopen/test/gtest/tensor_4d_generic_ocl_hip.cpp @@ -23,21 +23,16 @@ * SOFTWARE. * *******************************************************************************/ -#include "get_handle.hpp" -#include "random.hpp" -#include -#include #include -#include #include -#include +#include "get_handle.hpp" +#include "verify.hpp" #include "perf_helper.hpp" -#include #define MAX_TENSOR_ELEM 17 -#define PERF_ENABLE 0 +#define PERF_ENABLE 1 #define POW_2 1 struct TensorsConfig @@ -89,31 +84,30 @@ std::vector TensorsConfigs() if constexpr(PERF_ENABLE) { - const auto& handle = get_handle(); + auto deviceName = handle.GetDeviceName(); size_t maxTotalSize; - // Generate all NCHW tensors that are limited by L3 cache size // or 2xL2 cache size when L3 is not available - if(miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || - miopen::StartsWith(handle.GetDeviceName(), "gfx908")) + if(miopen::StartsWith(deviceName, "gfx90a") || + miopen::StartsWith(deviceName, "gfx908")) { maxTotalSize = 16; // twice the 8MB L2 } - else if(miopen::StartsWith(handle.GetDeviceName(), "gfx803")) + else if(miopen::StartsWith(deviceName, "gfx803")) { maxTotalSize = 4; // twice the 2MB L2 } - else if(miopen::StartsWith(handle.GetDeviceName(), "gfx900") || - miopen::StartsWith(handle.GetDeviceName(), "gfx906")) + else if(miopen::StartsWith(deviceName, "gfx900") || + miopen::StartsWith(deviceName, "gfx906")) { maxTotalSize = 8; // twice the 4MB L2 } - else if(miopen::StartsWith(handle.GetDeviceName(), "gfx942")) + else if(miopen::StartsWith(deviceName, "gfx942")) { maxTotalSize = 256; // 256MB L3 } - else if(miopen::StartsWith(handle.GetDeviceName(), "gfx103")) + else if(miopen::StartsWith(deviceName, "gfx103")) { maxTotalSize = 128; // 128MB L3 } @@ -121,7 +115,6 @@ std::vector TensorsConfigs() { maxTotalSize = 4; // twice the 2MB L2, default case. } - maxTotalSize = maxTotalSize * 1024ull * 1024ull / sizeof(T); if constexpr(POW_2) @@ -246,7 +239,6 @@ struct Op4DTensorGenericTest std::multiplies()); bitmap = 0; - bitmap |= (1 << (tensorsConfig.blens.size() - d)); for(int i = (d - 2); i >= 0; i--) @@ -262,17 +254,9 @@ struct Op4DTensorGenericTest } } - // if(std::accumulate(tensorsConfig.blens.begin(), - // tensorsConfig.blens.end(), - // 4, - // std::multiplies()) == 1) - // { - // bitmap = 4; - // } - num_wg_orig = num_wg; - max_num_wg = 4096; - num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; size_t local_threads = 256; @@ -289,7 +273,6 @@ struct Op4DTensorGenericTest void runCPU() { - std::vector A = tensA.data; std::vector B = tensB.data; std::vector C = tensC.data; @@ -374,7 +357,7 @@ struct Op4DTensorGenericTest static_cast(0), static_cast(0), static_cast(0), - static_cast(num_wg_orig)); + num_wg_orig); tensC_ocl.data = handle.Read(tensC_dev, tensC_ocl.data.size()); @@ -383,7 +366,6 @@ struct Op4DTensorGenericTest ph.perfTest(handle, "Op4dTensorGeneric", network_config_ocl, - false, tensA_dev.get(), static_cast(tensorsConfig.acstrides[0]), static_cast(tensorsConfig.acstrides[1]), @@ -410,7 +392,7 @@ struct Op4DTensorGenericTest static_cast(0), static_cast(0), static_cast(0), - static_cast(num_wg_orig)); + num_wg_orig); } } @@ -461,7 +443,7 @@ struct Op4DTensorGenericTest static_cast(0), static_cast(0), static_cast(0), - static_cast(num_wg_orig)); + num_wg_orig); tensC_hip.data = handle.Read(tensC_dev, tensC_hip.data.size()); @@ -470,7 +452,6 @@ struct Op4DTensorGenericTest ph.perfTest(handle, "Op4dTensorGeneric", network_config_hip, - false, tensA_dev.get(), static_cast(tensorsConfig.acstrides[0]), static_cast(tensorsConfig.acstrides[1]), @@ -497,7 +478,7 @@ struct Op4DTensorGenericTest static_cast(0), static_cast(0), static_cast(0), - static_cast(num_wg_orig)); + num_wg_orig); } } @@ -510,7 +491,7 @@ struct Op4DTensorGenericTest void verifyCPU() { auto error = miopen::rms_range(tensC_hip, tensC_cpu); - EXPECT_TRUE(error == 0) << "GPU outputs do not match each other. Error: " << error; + EXPECT_TRUE(error == 0) << "GPU outputs do not match CPU results. Error: " << error; } void TearDown() override @@ -537,7 +518,7 @@ struct Op4DTensorGenericTest stats += "_alpha0_" + std::to_string(alpha0) + "_alpha1_" + std::to_string(alpha1) + "_beta_" + std::to_string(beta) + "_" + miopen::GetDataType(data_type); - ph.writeStatsToCSV("tensor_4d.csv", stats); + ph.writeStatsToCSV("tensor_4d_generic.csv", stats); } } @@ -545,9 +526,7 @@ struct Op4DTensorGenericTest std::string params{}; std::vector vld, vgd; unsigned int bitmap; - int work_per_wg; - int num_wg_orig; - int max_num_wg; + int work_per_wg, num_wg_orig, max_num_wg; tensor tensA; tensor tensB; @@ -565,7 +544,7 @@ struct Op4DTensorGenericTest TensorsConfig tensorsConfig; float alpha0, alpha1, beta; - PerfHelper ph; + PerfHelper ph; }; struct GPU_Op4dTensorGenericTest_FP32 : Op4DTensorGenericTest @@ -574,12 +553,12 @@ struct GPU_Op4dTensorGenericTest_FP32 : Op4DTensorGenericTest TEST_P(GPU_Op4dTensorGenericTest_FP32, PortTest) { + // run CPU implementation +// runCPU(); // run OCL kernel runOCL(); // run HIP kernel runHIP(); - // run CPU implementation - // runCPU(); // verify if the output tensors are same verify(); }