From 6dc0b7feae5bfbe90db97029015f28635e05e653 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Wed, 27 Aug 2025 23:16:02 +0000 Subject: [PATCH 01/13] TMP: run with CK split k autodeduce fix --- projects/miopen/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/miopen/requirements.txt b/projects/miopen/requirements.txt index e1298bb4729..f164e043420 100755 --- a/projects/miopen/requirements.txt +++ b/projects/miopen/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.22 ROCm/eigen@3.4.0 ROCm/frugally-deep@38c52448b1a4996b3e0e435a877d02441098b1dd -ROCm/composable_kernel@0db21053e68817a50b0ed0ceea87e88228ab2475 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@ca42cc99a5a3809f993cbdc351a43a74f3c60752 -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From ea14cb6e26e891d72dbfb5b944c90ac05350781a Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Thu, 28 Aug 2025 23:55:55 +0000 Subject: [PATCH 02/13] [MIOpen] Add -1 to the list of splitk values to iterate over Prior to this change, splitk values could be the power of 2 values in the range 1 to 128, but CK has a special splitk autodeduce value of -1 that in some cases can provide better performance. This commit adds -1 to the set of splitk CK values, which improves the overall performance of CK solvers. --- .../miopen/solver/implicitgemm_ck_util.hpp | 22 +++++++++++++++++++ ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 6 ++--- ...v_hip_implicit_gemm_grouped_bwd_xdlops.cpp | 6 ++--- ...v_hip_implicit_gemm_grouped_wrw_xdlops.cpp | 6 ++--- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index cbb1fd60eb6..b751b4ff7cb 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -51,6 +51,28 @@ struct ProblemDescription; namespace solver { #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +static constexpr int CkSplitkAutoDeduce = -1; + +template +inline static bool NextCKSplitkValue(int& v) +{ + assert((IsTwoPower(v) || v == CkSplitkAutoDeduce)); + if(v == H) + { + v = CkSplitkAutoDeduce; + return true; + } + if(v == CkSplitkAutoDeduce) + { + v = L; + return false; + } + + v *= 2; + return false; +} + namespace conv { template using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight< diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 0c15d775091..d78f2938b63 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -323,7 +323,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::Init(const ProblemDescrip break; } index = 0; - split_k = 1; + split_k = CkSplitkAutoDeduce; kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } @@ -365,7 +365,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( [[maybe_unused]] const ProblemDescription& problem) { index = 0; - split_k = 1; + split_k = CkSplitkAutoDeduce; kernel_id = ""; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL @@ -398,7 +398,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::SetNextValue( } do { - bool flag = NextTwoPower<1, 128>(split_k); + bool flag = NextCKSplitkValue<1, 128>(split_k); if(!flag) { kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp index 4533508e15b..00a57c0a7b5 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp @@ -216,7 +216,7 @@ void PerformanceConfigHipImplicitGemmGroupBwdXdlops::Init(const ProblemDescripti { valid_kernels = FillValidKernelsIDs, CKArgs>(problem); index = 0; - split_k = 1; + split_k = CkSplitkAutoDeduce; kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } @@ -398,7 +398,7 @@ void PerformanceConfigHipImplicitGemmGroupBwdXdlops::HeuristicInit( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) { - split_k = 1; + split_k = CkSplitkAutoDeduce; index = 0; kernel_id = ""; @@ -460,7 +460,7 @@ bool PerformanceConfigHipImplicitGemmGroupBwdXdlops::SetNextValue(const ProblemD } do { - bool flag = NextTwoPower<1, 128>(split_k); + bool flag = NextCKSplitkValue<1, 128>(split_k); if(!flag) { kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp index f6a38370b43..858c8ab0df5 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp @@ -221,7 +221,7 @@ void PerformanceConfigHipImplicitGemmGroupWrwXdlops::Init(const ProblemDescripti { valid_kernels = FillValidKernelsIDs, CKArgs>(problem); index = 0; - split_k = 1; + split_k = CkSplitkAutoDeduce; kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } @@ -438,7 +438,7 @@ void PerformanceConfigHipImplicitGemmGroupWrwXdlops::HeuristicInit( [[maybe_unused]] const ProblemDescription& problem) { // these seem redundant - split_k = 1; + split_k = CkSplitkAutoDeduce; index = 0; kernel_id = ""; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL @@ -499,7 +499,7 @@ bool PerformanceConfigHipImplicitGemmGroupWrwXdlops::SetNextValue(const ProblemD } do { - bool flag = NextTwoPower<1, 128>(split_k); + bool flag = NextCKSplitkValue<1, 128>(split_k); if(!flag) { kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); From 86e2673c0ba32e6038161991a926ffc7d3b1c1b5 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Tue, 2 Sep 2025 04:01:48 +0000 Subject: [PATCH 03/13] [MIOpen] Get the workspace size from CK for grp wrw, grp bwd, and 2D grp wrw MIOpen was setting the CK workspace size to zero if some instances did not require a workspace for a particular shape, but this precluded some CK instances that might require a workspace from even being considered. And in some cases, the CK solver performance was reduced because the CK instance that needed the workspace might actually have been the fastest instance for that solver. This commit changes MIOpen to get the workspace size from CK for the grp wrw, grp bwd, and 3D grp wrw solvers. This will boost CK performance for bwd and wrw, which will be needed for bwd and wrw fusions, since we need to make CK as performant as possible (since the fusions will use only CK instances). Note that the other CK solvers can be converted to get the workspace size directly from CK in the future. --- .../src/include/miopen/conv/solvers.hpp | 8 +++ .../miopen/solver/implicitgemm_ck_util.hpp | 45 +++++++++---- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 66 +++++++++++++++---- ...v_hip_implicit_gemm_grouped_bwd_xdlops.cpp | 47 ++++++++++--- ...v_hip_implicit_gemm_grouped_wrw_xdlops.cpp | 53 ++++++++++----- 5 files changed, 170 insertions(+), 49 deletions(-) diff --git a/projects/miopen/src/include/miopen/conv/solvers.hpp b/projects/miopen/src/include/miopen/conv/solvers.hpp index 5fa56cb8da1..f48ec8c7b46 100644 --- a/projects/miopen/src/include/miopen/conv/solvers.hpp +++ b/projects/miopen/src/include/miopen/conv/solvers.hpp @@ -4644,6 +4644,10 @@ struct ConvHipImplicitGemm3DGroupWrwXdlops final private: template bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; + + template + std::size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription&) const; + size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const; }; struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops @@ -4816,6 +4820,8 @@ struct ConvHipImplicitGemmGroupBwdXdlops final private: template bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; + + size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const; }; struct PerformanceConfigHipImplicitGemmGroupWrwXdlops @@ -4910,6 +4916,8 @@ struct ConvHipImplicitGemmGroupWrwXdlops final private: template bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; + + size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const; }; } // namespace conv diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index b751b4ff7cb..2bd8d8cfb09 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -322,6 +322,32 @@ bool IsCKApplicable(const ProblemDescriptionType& problem) ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); }); } +template +size_t GetCKSplitkMaxWorkspaceSize(const ProblemDescriptionType& problem) +{ + const auto args = CKArgsType{problem}; + auto max_workspace_size = 0; + + const auto ptrs = DeviceOpType::GetInstances(); + for(auto& ptr : ptrs) + { + auto split_k = CkSplitkAutoDeduce; + do + { + if(args.IsSupportedBySplitK(ptr, split_k)) + { + auto workspace_size = args.GetCKSplitkWorkspaceSize(ptr, split_k); + if(workspace_size > max_workspace_size) + max_workspace_size = workspace_size; + } + } while(!NextCKSplitkValue<1, 128>(split_k)); + } + + return max_workspace_size; +} + #define WORKAROUND_CK_ISSUE_1184 1 #if WORKAROUND_CK_ISSUE_1184 using WorkAroundHipEventProfiler = HipEventProfiler; @@ -766,13 +792,14 @@ inline bool CKWrwRequireWorkspace( } /// \todo move to a cpp file -inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDescription& problem) +inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDescription& problem, + size_t ck_ws_size = -1) { if(problem.IsLayoutNHWC()) { if(problem.GetDirection() == ::miopen::conv::Direction::BackwardWeights) { - return GetCKAlphaBetaWorkspace(problem); + return (ck_ws_size == -1) ? GetCKAlphaBetaWorkspace(problem) : ck_ws_size; } return 0; } @@ -781,10 +808,11 @@ inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDes if(problem.GetDirection() == ::miopen::conv::Direction::BackwardWeights) { - MultiBufferWorkspaceTraits wt({GetPackedSize(problem.GetIn()), - GetPackedSize(problem.GetWeights()), - GetPackedSize(problem.GetOut()), - GetCKAlphaBetaWorkspace(problem)}); + MultiBufferWorkspaceTraits wt( + {GetPackedSize(problem.GetIn()), + GetPackedSize(problem.GetWeights()), + GetPackedSize(problem.GetOut()), + (ck_ws_size == -1) ? GetCKAlphaBetaWorkspace(problem) : ck_ws_size}); return wt.GetSize(); } @@ -1101,11 +1129,6 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, std::optional _ck_buff_des; - if(problem.IsDirectionBackwardWrW()) - { - _ck_buff_des.emplace(GetCKAlphaBetaWorkspace(problem), 0); - } - auto ptr_iter = FindConvPtrByID(conv_ptrs, id_string); if(ptr_iter == conv_ptrs.end()) { diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index d78f2938b63..10f20b402d2 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -250,29 +250,32 @@ struct CKArgs template bool IsSupportedBy(const ConvPtr& conv_ptr) const { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1); - // Creat dummy workspace to pass the ck IsSupportedArgument check. - - int dummy_var = 1; - conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var); - + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1); + auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + if(workspace_size != 0) + conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size); return conv_ptr->IsSupportedArgument(arg_ptr.get()); } template bool IsSupportedBySplitK(const ConvPtr& conv_ptr, int split_k) const { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); - - if(CKWrwRequireWorkspace(G, C1, K1, data_type, alpha_beta_case)) + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); + auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + if(workspace_size != 0) { - // Creat dummy workspace to pass the ck IsSupportedArgument check. - int dummy_var = 1; - conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var); + conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size); } return conv_ptr->IsSupportedArgument(arg_ptr.get()); } + template + std::size_t GetCKSplitkWorkspaceSize(const ConvPtr& conv_ptr, int split_k) const + { + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); + return conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + } + int G; int N; int K; @@ -465,11 +468,48 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig( return config.IsValid(problem); } +template +size_t +ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const +{ + switch(problem.GetAlphaBetaCase()) + { + case BILINEAR: + return GetCKSplitkMaxWorkspaceSize, + CKArgs>(problem); + case SCALE: + return GetCKSplitkMaxWorkspaceSize, CKArgs>( + problem); + default: + return GetCKSplitkMaxWorkspaceSize, + CKArgs>(problem); + } +} + +size_t +ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const +{ + switch(problem.GetInDataType()) + { + case miopenHalf: return GetCKMaxWorkspaceSize(problem); + case miopenFloat: return GetCKMaxWorkspaceSize(problem); + case miopenInt8: return GetCKMaxWorkspaceSize(problem); + case miopenBFloat16: return GetCKMaxWorkspaceSize(problem); + case miopenInt64: + case miopenInt32: + case miopenFloat8_fnuz: + case miopenBFloat8_fnuz: + case miopenDouble: break; + } + return 0; // other types not applicable for this solver +} + size_t ConvHipImplicitGemm3DGroupWrwXdlops::GetWorkspaceSize(const ExecutionContext&, const ProblemDescription& problem) const { - return GetWorkspaceSizeLayoutTransformConv(problem); + auto ck_ws_size = GetCKMaxWorkspaceSize(problem); + return GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size); } PerformanceConfigHipImplicitGemm3DGroupWrwXdlops diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp index 00a57c0a7b5..7d41d7ce478 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp @@ -166,24 +166,30 @@ struct CKArgs template bool IsSupportedBy(const ConvPtr& conv_ptr) const { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1); + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1); + auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + if(workspace_size != 0) + conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size); return conv_ptr->IsSupportedArgument(arg_ptr.get()); } template bool IsSupportedBySplitK(const ConvPtr& conv_ptr, int split_k) const { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); - - if(CKWrwRequireWorkspace(G, C1, K1, data_type, alpha_beta_case)) - { - // Creat dummy workspace to pass the ck IsSupportedArgument check. - int dummy_var = 1; - conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var); - } + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); + auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + if(workspace_size != 0) + conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size); return conv_ptr->IsSupportedArgument(arg_ptr.get()); } + template + std::size_t GetCKSplitkWorkspaceSize(const ConvPtr& conv_ptr, int split_k) const + { + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); + return conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + } + int G; int N; int K; @@ -527,10 +533,31 @@ bool ConvHipImplicitGemmGroupBwdXdlops::IsValidPerformanceConfig( return config.IsValid(problem); } +size_t +ConvHipImplicitGemmGroupBwdXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const +{ + switch(problem.GetInDataType()) + { + case miopenHalf: + return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenFloat: return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenInt8: return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenBFloat16: + return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenInt64: + case miopenInt32: + case miopenFloat8_fnuz: + case miopenBFloat8_fnuz: + case miopenDouble: break; + } + return 0; // other types not applicable for this solver +} + size_t ConvHipImplicitGemmGroupBwdXdlops::GetWorkspaceSize(const ExecutionContext&, const ProblemDescription& problem) const { - return GetWorkspaceSizeLayoutTransformConv(problem); + auto ck_ws_size = GetCKMaxWorkspaceSize(problem); + return GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size); } PerformanceConfigHipImplicitGemmGroupBwdXdlops diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp index 858c8ab0df5..b54437f3838 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp @@ -165,29 +165,30 @@ struct CKArgs template bool IsSupportedBy(const ConvPtr& conv_ptr) const { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1); - // Creat dummy workspace to pass the ck IsSupportedArgument check. - - int dummy_var = 1; - conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var); - + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1); + auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + if(workspace_size != 0) + conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size); return conv_ptr->IsSupportedArgument(arg_ptr.get()); } template bool IsSupportedBySplitK(const ConvPtr& conv_ptr, int split_k) const { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); - - if(CKWrwRequireWorkspace(G, C1, K1, data_type, alpha_beta_case)) - { - // Creat dummy workspace to pass the ck IsSupportedArgument check. - int dummy_var = 1; - conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var); - } + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); + auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + if(workspace_size != 0) + conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size); return conv_ptr->IsSupportedArgument(arg_ptr.get()); } + template + std::size_t GetCKSplitkWorkspaceSize(const ConvPtr& conv_ptr, int split_k) const + { + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k); + return conv_ptr->GetWorkSpaceSize(arg_ptr.get()); + } + int G; int N; int K; @@ -567,10 +568,32 @@ bool ConvHipImplicitGemmGroupWrwXdlops::IsValidPerformanceConfig( return config.IsValid(problem); } +size_t +ConvHipImplicitGemmGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const +{ + + switch(problem.GetInDataType()) + { + case miopenHalf: + return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenFloat: return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenInt8: return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenBFloat16: + return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); + case miopenInt64: + case miopenInt32: + case miopenFloat8_fnuz: + case miopenBFloat8_fnuz: + case miopenDouble: break; + } + return 0; // other types not applicable for this solver +} + size_t ConvHipImplicitGemmGroupWrwXdlops::GetWorkspaceSize(const ExecutionContext&, const ProblemDescription& problem) const { - return GetWorkspaceSizeLayoutTransformConv(problem); + auto ck_ws_size = GetCKMaxWorkspaceSize(problem); + return GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size); } PerformanceConfigHipImplicitGemmGroupWrwXdlops From 4179753b95ef80237d2a62aac6ad3d46738387c4 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Tue, 2 Sep 2025 22:01:07 +0000 Subject: [PATCH 04/13] [MIOpen] Add missing include for implicitgemm_util.hp --- .../miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index 2bd8d8cfb09..c75b885430e 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -33,6 +33,7 @@ #include #include #include +#include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL #include From 136c2502c94a023049b53799b249b787da0535ca Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Wed, 3 Sep 2025 02:31:24 +0000 Subject: [PATCH 05/13] Fix no-CK builds --- .../src/include/miopen/solver/implicitgemm_ck_util.hpp | 3 ++- .../conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 6 ++++++ .../conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp | 2 ++ .../conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp | 3 ++- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index c75b885430e..3fcc337976b 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -51,7 +51,6 @@ struct ProblemDescription; } // namespace conv namespace solver { -#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL static constexpr int CkSplitkAutoDeduce = -1; @@ -74,6 +73,8 @@ inline static bool NextCKSplitkValue(int& v) return false; } +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + namespace conv { template using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight< diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 10f20b402d2..fe00589af36 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -472,6 +472,7 @@ template size_t ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL switch(problem.GetAlphaBetaCase()) { case BILINEAR: @@ -484,11 +485,15 @@ ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescript return GetCKSplitkMaxWorkspaceSize, CKArgs>(problem); } +#else + return 0; +#endif } size_t ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL switch(problem.GetInDataType()) { case miopenHalf: return GetCKMaxWorkspaceSize(problem); @@ -501,6 +506,7 @@ ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescript case miopenBFloat8_fnuz: case miopenDouble: break; } +#endif return 0; // other types not applicable for this solver } diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp index 7d41d7ce478..3506e719a3c 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp @@ -536,6 +536,7 @@ bool ConvHipImplicitGemmGroupBwdXdlops::IsValidPerformanceConfig( size_t ConvHipImplicitGemmGroupBwdXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL switch(problem.GetInDataType()) { case miopenHalf: @@ -550,6 +551,7 @@ ConvHipImplicitGemmGroupBwdXdlops::GetCKMaxWorkspaceSize(const ProblemDescriptio case miopenBFloat8_fnuz: case miopenDouble: break; } +#endif return 0; // other types not applicable for this solver } diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp index b54437f3838..6d6df63520f 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp @@ -571,7 +571,7 @@ bool ConvHipImplicitGemmGroupWrwXdlops::IsValidPerformanceConfig( size_t ConvHipImplicitGemmGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const { - +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL switch(problem.GetInDataType()) { case miopenHalf: @@ -586,6 +586,7 @@ ConvHipImplicitGemmGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescriptio case miopenBFloat8_fnuz: case miopenDouble: break; } +#endif return 0; // other types not applicable for this solver } From 485cc5f58243d5291041fcd68863b644718dee8b Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Wed, 3 Sep 2025 20:20:24 +0000 Subject: [PATCH 06/13] Size_t is unsigned, so it's weird to pass -1 to it Use 0 as the default and then use the ck workspace size value if it is non-zero, otherwise we fallback to GetCKAlphaBetaWorkspace(). Note that this can be cleaned up in the future when we convert all the CK solvers to get the workspace size from CK. --- .../src/include/miopen/solver/implicitgemm_ck_util.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index 3fcc337976b..cb135632a6c 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -795,13 +795,13 @@ inline bool CKWrwRequireWorkspace( /// \todo move to a cpp file inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDescription& problem, - size_t ck_ws_size = -1) + size_t ck_ws_size = 0) { if(problem.IsLayoutNHWC()) { if(problem.GetDirection() == ::miopen::conv::Direction::BackwardWeights) { - return (ck_ws_size == -1) ? GetCKAlphaBetaWorkspace(problem) : ck_ws_size; + return (ck_ws_size > 0) ? ck_ws_size : GetCKAlphaBetaWorkspace(problem); } return 0; } @@ -814,7 +814,7 @@ inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDes {GetPackedSize(problem.GetIn()), GetPackedSize(problem.GetWeights()), GetPackedSize(problem.GetOut()), - (ck_ws_size == -1) ? GetCKAlphaBetaWorkspace(problem) : ck_ws_size}); + (ck_ws_size > 0) ? ck_ws_size : GetCKAlphaBetaWorkspace(problem)}); return wt.GetSize(); } From 61310a0443a164646a82264a48e24fc2a520e335 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:27:24 +0000 Subject: [PATCH 07/13] Ensure workspace is calculated and passed correctly in factory invokers This change fixes some broken unit tests, and addresses an issue with the resulting workspace size not being reported correctly. --- .../miopen/solver/implicitgemm_ck_util.hpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index cb135632a6c..cd5c07a97ed 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -347,6 +347,7 @@ size_t GetCKSplitkMaxWorkspaceSize(const ProblemDescriptionType& problem) } while(!NextCKSplitkValue<1, 128>(split_k)); } + MIOPEN_LOG_I("Max workspace size reported by CK: " << max_workspace_size); return max_workspace_size; } @@ -1138,6 +1139,14 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, return {miopenStatusInvalidValue}; } + if constexpr(std::is_same_v) { + auto ck_ws_size = ck_args.GetCKSplitkWorkspaceSize(*ptr_iter, split_k.value()); + _ck_buff_des.emplace(ck_ws_size, 0); + result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size); + } else { + result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem); + } + auto [_input1_tr_inst, _input2_tr_inst, _output_tr_inst, _output_init_tr_inst] = internal::MakeTaggedTransposeInstances( result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des); @@ -1244,8 +1253,6 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, output_tr_inst.ConvertTo(handle, kernels, conv_tensors); }; }; - - result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem); #endif return result; } @@ -1282,8 +1289,9 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&, ConvSolution result; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL miopenAlphaBetaCase_t alpha_beta_case = problem.GetAlphaBetaCase(); - [[maybe_unused]] bool should_allocated_wrw_buffer = - ShouldAllocateWorkSpaceBufferForWRW(problem); + auto ck_args = CKArgsType{problem}; + auto ck_ws_size = ck_args.GetCKSplitkWorkspaceSize(*ptr_iter, split_k.value_or(1)); + [[maybe_unused]] bool should_allocated_wrw_buffer = ck_ws_size > 0; result.invoker_factory = [kernel_id = kernel_id, split_k = split_k, @@ -1344,7 +1352,7 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&, } }; }; - result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem); + result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size); #endif return result; } From a92915c01d9f233541f02b93f80f2d1c2f1dba2d Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:38:19 +0000 Subject: [PATCH 08/13] Revert "TMP: run with CK split k autodeduce fix" This reverts commit 6dc0b7feae5bfbe90db97029015f28635e05e653. --- projects/miopen/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/miopen/requirements.txt b/projects/miopen/requirements.txt index f164e043420..e1298bb4729 100755 --- a/projects/miopen/requirements.txt +++ b/projects/miopen/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.22 ROCm/eigen@3.4.0 ROCm/frugally-deep@38c52448b1a4996b3e0e435a877d02441098b1dd -ROCm/composable_kernel@ca42cc99a5a3809f993cbdc351a43a74f3c60752 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@0db21053e68817a50b0ed0ceea87e88228ab2475 -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From 015b286fe9613d69194d86ecd98833bad4f62dcc Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Thu, 4 Sep 2025 22:32:18 +0000 Subject: [PATCH 09/13] Fix unchecked access to optional value tidy error --- .../miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index cd5c07a97ed..dbd3a2f391d 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -1140,7 +1140,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, } if constexpr(std::is_same_v) { - auto ck_ws_size = ck_args.GetCKSplitkWorkspaceSize(*ptr_iter, split_k.value()); + auto ck_ws_size = ck_args.GetCKSplitkWorkspaceSize(*ptr_iter, split_k.value_or(1)); _ck_buff_des.emplace(ck_ws_size, 0); result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size); } else { From b3ee43d11e3ccfd96a0abb784171429c454662a8 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Fri, 5 Sep 2025 21:06:49 +0000 Subject: [PATCH 10/13] Remove invalid check in generic search The check implied that a particular solver had to use the same workspace size for all the different kernels and parameters, but CK can have some kernels that have non-zero workspace size while others have zero workspace size for the same problem. --- projects/miopen/src/include/miopen/generic_search.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/projects/miopen/src/include/miopen/generic_search.hpp b/projects/miopen/src/include/miopen/generic_search.hpp index 35426fd35fc..e922a23caf0 100644 --- a/projects/miopen/src/include/miopen/generic_search.hpp +++ b/projects/miopen/src/include/miopen/generic_search.hpp @@ -553,15 +553,6 @@ auto GenericSearch(const Solver s, try { - if(default_solution.workspace_sz != current_solution.workspace_sz) - { - ret = -2; - MIOPEN_LOG_E('#' << n_current << " (" << n_runs_total << ") " - << "Workspace size should not depend on PerformanceConfig: " - << default_solution.workspace_sz - << " != " << current_solution.workspace_sz); - } - invoker = profile_h.PrepareInvoker(*current_solution.invoker_factory, current_solution.construction_params); From df40ddb948d58bf36daa83bab7b8b58938608af1 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Tue, 16 Sep 2025 01:05:57 +0000 Subject: [PATCH 11/13] Update changelog to mention improved selection of CK kernels during tuning --- projects/miopen/CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/miopen/CHANGELOG.md b/projects/miopen/CHANGELOG.md index 8c12ca669a7..a51261117ad 100644 --- a/projects/miopen/CHANGELOG.md +++ b/projects/miopen/CHANGELOG.md @@ -2,6 +2,10 @@ # Change Log for MIOpen Full documentation for MIOpen is available [here](https://rocm.docs.amd.com/projects/MIOpen/en/latest/) +## MIOpen 3.5.0 for ROCm 7.1.0 +### Optimized +* [Conv] Improve Composable Kernel (CK) kernel selection during tuning + ## MIOpen 3.5.0 for ROCm 7.0.0 ### Added * [Conv] Added misa kernels for gfx950 From be4ac13e598a56abd137bd9092a47eb452def7a8 Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Wed, 24 Sep 2025 21:40:17 +0000 Subject: [PATCH 12/13] Fix issue with split_k autodeduce being the default This commit changes the default split_k settings to start at index 0 and split_k 1 rather than -1 (AutoDeduce). The problem with starting at -1 meant that this was the default config, and at the end of the generic search MIOpen runs a final run on the default config *but* this might not be a valid config for CK (not applicable). This would cause the the tuning to fail and would also cause a failure in immediate mode. The solution is to restore the defaults to their original values (index 0, split_k 1) and then iterate through the split_k values from 1, ..., 128, -1 and then wrap to the next index. --- .../miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp | 4 ++-- .../conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 4 ++-- .../solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp | 4 ++-- .../solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index dbd3a2f391d..a58e04ce4dd 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -61,12 +61,12 @@ inline static bool NextCKSplitkValue(int& v) if(v == H) { v = CkSplitkAutoDeduce; - return true; + return false; } if(v == CkSplitkAutoDeduce) { v = L; - return false; + return true; } v *= 2; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index fe00589af36..c17dec86841 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -326,7 +326,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::Init(const ProblemDescrip break; } index = 0; - split_k = CkSplitkAutoDeduce; + split_k = 1; kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } @@ -368,7 +368,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( [[maybe_unused]] const ProblemDescription& problem) { index = 0; - split_k = CkSplitkAutoDeduce; + split_k = 1; kernel_id = ""; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp index 3506e719a3c..f96d64fabd7 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp @@ -222,7 +222,7 @@ void PerformanceConfigHipImplicitGemmGroupBwdXdlops::Init(const ProblemDescripti { valid_kernels = FillValidKernelsIDs, CKArgs>(problem); index = 0; - split_k = CkSplitkAutoDeduce; + split_k = 1; kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } @@ -404,7 +404,7 @@ void PerformanceConfigHipImplicitGemmGroupBwdXdlops::HeuristicInit( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) { - split_k = CkSplitkAutoDeduce; + split_k = 1; index = 0; kernel_id = ""; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp index 6d6df63520f..a0aacc56506 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp @@ -222,7 +222,7 @@ void PerformanceConfigHipImplicitGemmGroupWrwXdlops::Init(const ProblemDescripti { valid_kernels = FillValidKernelsIDs, CKArgs>(problem); index = 0; - split_k = CkSplitkAutoDeduce; + split_k = 1; kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } @@ -439,7 +439,7 @@ void PerformanceConfigHipImplicitGemmGroupWrwXdlops::HeuristicInit( [[maybe_unused]] const ProblemDescription& problem) { // these seem redundant - split_k = CkSplitkAutoDeduce; + split_k = 1; index = 0; kernel_id = ""; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL From 88acd541cdbd27a8692c1b2ea8a94d76c8ce4f8a Mon Sep 17 00:00:00 2001 From: Jonathan Lichtner <195780826+JonathanLichtnerAMD@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:49:27 +0000 Subject: [PATCH 13/13] Do not use CK's AutoDeduce with bwd --- .../solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp index f96d64fabd7..5dc6e154021 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp @@ -466,7 +466,7 @@ bool PerformanceConfigHipImplicitGemmGroupBwdXdlops::SetNextValue(const ProblemD } do { - bool flag = NextCKSplitkValue<1, 128>(split_k); + bool flag = NextTwoPower<1, 128>(split_k); if(!flag) { kernel_id = valid_kernels[index] + "+" + std::to_string(split_k);