diff --git a/projects/miopen/src/include/miopen/conv/solvers.hpp b/projects/miopen/src/include/miopen/conv/solvers.hpp index 2626b00b700..513b5552f24 100644 --- a/projects/miopen/src/include/miopen/conv/solvers.hpp +++ b/projects/miopen/src/include/miopen/conv/solvers.hpp @@ -4556,11 +4556,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; + float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 8968230e6b4..751bd964c9d 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -40,6 +40,7 @@ #endif #include MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) +MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE); namespace miopen { namespace solver { @@ -360,7 +361,69 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init(const ProblemDescrip FillValidKernelsIDs, CKArgs>(problem); break; } - index = 0; + index = 0; + + auto find_kernel = [&valid_kernels = std::as_const(valid_kernels)]( + const std::size_t& index, const std::string& kernel_id) -> std::size_t { + // Check if valid_kernels[index] equals kernel_id. + if(index < valid_kernels.size() && valid_kernels[index] == kernel_id) + return index; + + // Linear search for kernel_id in valid_kernels. + auto it = std::find(valid_kernels.begin(), valid_kernels.end(), kernel_id); + if(it != valid_kernels.end()) + return static_cast(it - valid_kernels.begin()); + + // Not found: return 0 + MIOPEN_LOG_E("Not found :" << index << "-" << kernel_id); + return 0; + }; + + // for BF16 and FP16 + index = env::value(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE); + if(index == 0 && problem.GetInChannels() > 8 && problem.GetGroupCount() == 1 && + problem.GetAlphaBetaCase() == DEFAULT) + { + int K = problem.GetOutChannels(); + if(problem.GetInDataType() == miopenBFloat16) + { + if(K < 64) + { + index = + find_kernel(38, + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" + "<256, 64, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, " + "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>"); + } + else + { + index = + find_kernel(30, + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" + "<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, " + "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>"); + } + } + else if(problem.GetInDataType() == miopenHalf) + { + if(K < 64) + { + index = + find_kernel(57, + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" + "<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, " + "BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>"); + } + else + { + index = + find_kernel(31, + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" + "<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, " + "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>"); + } + } + } kernel_id = valid_kernels[index]; } @@ -425,6 +488,11 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::SetNextValue( { HeuristicInit(problem); assert(!valid_kernels.empty()); + if(index != 0) + { + index = 0; + kernel_id = valid_kernels[index]; + } return true; } if((index + 1) < valid_kernels.size()) @@ -540,6 +608,34 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( return false; } +float ConvHipImplicitGemm3DGroupFwdXdlops::GetWti( + const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const +{ + decltype(auto) xDesc = problem.GetIn(); + decltype(auto) wDesc = problem.GetWeights(); + + if(xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16) + { + std::size_t in_n, in_c, w_x, w_y, w_d; + std::tie(in_n, in_c) = tie_pick<0, 1>()(xDesc.GetLengths()); + std::tie(w_x, w_y, w_d) = tie_pick<2, 3, 4>()(wDesc.GetLengths()); + // For cases where the filter shape is not 1x1x1 and the input channel (in_c) is greater + // than 8, CK's implementation offers better performance. + if((w_x == 1 && w_y == 1 && w_d == 1) == false) + { + if(in_c < 8 && in_n < 4) + { + return 0.00002; // force disable + } + else + { + return 1.0; // force enable + } + } + } + return 0.02f; +} + ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem, diff --git a/projects/miopen/test/gtest/perf_config_HipImplicitGemm3DGroupFwdXdlops.cpp b/projects/miopen/test/gtest/perf_config_HipImplicitGemm3DGroupFwdXdlops.cpp new file mode 100644 index 00000000000..d99fd6fbe26 --- /dev/null +++ b/projects/miopen/test/gtest/perf_config_HipImplicitGemm3DGroupFwdXdlops.cpp @@ -0,0 +1,83 @@ +#include +#include + +#include +#include +#include +#include + +using Problem = miopen::conv::ProblemDescription; +using Config = miopen::solver::conv::PerformanceConfigHipImplicitGemm3DGroupFwdXdlops; + +struct PerfConfigTestCase +{ + struct group_conv::GroupConvTestConfig<3u> conv; + miopenDataType_t data_type; + miopenTensorLayout_t layout; + std::string arch; +}; + +std::vector GetPerfConfigTestCases(miopenDataType_t data_type, std::string arch) +{ + return {{{1, 128, 64, 32, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, + data_type, + miopenTensorNCDHW, + arch}, + {{1, 128, 64, 192, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, + data_type, + miopenTensorNCDHW, + arch}}; +} + +template +class PerfConfig_HipImplicitGemm3DGroupFwdXdlops + : public ::testing::TestWithParam +{ +protected: + void TestConfigs() + { + auto test_case = GetParam(); + + auto&& handle = get_handle(); + miopen::ExecutionContext ctx(&handle); + if(test_case.arch != ctx.GetStream().GetDeviceName()) + GTEST_SKIP(); + + auto input_tensor_desc = + miopen::TensorDescriptor(test_case.data_type, test_case.conv.GetInput()); + + auto weights_tensor_desc = miopen::TensorDescriptor( + test_case.data_type, test_case.layout, test_case.conv.GetWeights()); + + auto conv_desc = test_case.conv.GetConv(); + + auto output_desc = conv_desc.GetForwardOutputTensor( + input_tensor_desc, weights_tensor_desc, test_case.data_type); + + auto problem = miopen::conv::ProblemDescription(input_tensor_desc, + weights_tensor_desc, + output_desc, + conv_desc, + miopen::conv::Direction::Forward); + + Config cfg; + cfg.HeuristicInit(problem); + EXPECT_TRUE(cfg.index != 0) << "index is 0:" << test_case.conv; + } +}; + +using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16 = + PerfConfig_HipImplicitGemm3DGroupFwdXdlops; +using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16 = + PerfConfig_HipImplicitGemm3DGroupFwdXdlops; + +TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16, All) { TestConfigs(); } +TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16, All) { TestConfigs(); } + +INSTANTIATE_TEST_SUITE_P(Full, + GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16, + testing::ValuesIn(GetPerfConfigTestCases(miopenBFloat16, "gfx942"))); + +INSTANTIATE_TEST_SUITE_P(Full, + GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16, + testing::ValuesIn(GetPerfConfigTestCases(miopenHalf, "gfx942")));