Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6dc0b7f
TMP: run with CK split k autodeduce fix
JonathanLichtnerAMD Aug 27, 2025
ea14cb6
[MIOpen] Add -1 to the list of splitk values to iterate over
JonathanLichtnerAMD Aug 28, 2025
86e2673
[MIOpen] Get the workspace size from CK for grp wrw, grp bwd, and 2D …
JonathanLichtnerAMD Sep 2, 2025
4179753
[MIOpen] Add missing include for implicitgemm_util.hp
JonathanLichtnerAMD Sep 2, 2025
136c250
Fix no-CK builds
JonathanLichtnerAMD Sep 3, 2025
485cc5f
Size_t is unsigned, so it's weird to pass -1 to it
JonathanLichtnerAMD Sep 3, 2025
61310a0
Ensure workspace is calculated and passed correctly in factory invokers
JonathanLichtnerAMD Sep 4, 2025
a92915c
Revert "TMP: run with CK split k autodeduce fix"
JonathanLichtnerAMD Sep 4, 2025
371f2d6
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Sep 4, 2025
015b286
Fix unchecked access to optional value tidy error
JonathanLichtnerAMD Sep 4, 2025
b3ee43d
Remove invalid check in generic search
JonathanLichtnerAMD Sep 5, 2025
a708a6b
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Sep 8, 2025
df40ddb
Update changelog to mention improved selection of CK kernels during t…
JonathanLichtnerAMD Sep 16, 2025
1b9c417
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Sep 17, 2025
12a456c
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Sep 18, 2025
a8728f7
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Sep 24, 2025
be4ac13
Fix issue with split_k autodeduce being the default
JonathanLichtnerAMD Sep 24, 2025
0372af3
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Sep 29, 2025
88acd54
Do not use CK's AutoDeduce with bwd
JonathanLichtnerAMD Sep 29, 2025
672655f
Merge branch 'develop' into users/jlichtne/optimize-ck-solvers
JonathanLichtnerAMD Oct 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions projects/miopen/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Full documentation for MIOpen is available [here](https://rocm.docs.amd.com/proj
### Optimized

* [Conv] Enabled Composable Kernel (CK) implicit gemms on gfx950.
* [Conv] Improve Composable Kernel (CK) kernel selection during tuning

### Resolved issues

Expand Down
8 changes: 8 additions & 0 deletions projects/miopen/src/include/miopen/conv/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4644,6 +4644,10 @@ struct ConvHipImplicitGemm3DGroupWrwXdlops final
private:
template <typename DataType>
bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const;

template <typename DataType>
std::size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription&) const;
size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const;
};

struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops
Expand Down Expand Up @@ -4816,6 +4820,8 @@ struct ConvHipImplicitGemmGroupBwdXdlops final
private:
template <typename DataType>
bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const;

size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const;
};

struct PerformanceConfigHipImplicitGemmGroupWrwXdlops
Expand Down Expand Up @@ -4910,6 +4916,8 @@ struct ConvHipImplicitGemmGroupWrwXdlops final
private:
template <typename DataType>
bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const;

size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const;
};

} // namespace conv
Expand Down
9 changes: 0 additions & 9 deletions projects/miopen/src/include/miopen/generic_search.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,15 +553,6 @@ auto GenericSearch(const Solver s,

try
{
if(default_solution.workspace_sz != current_solution.workspace_sz)
{
ret = -2;
MIOPEN_LOG_E('#' << n_current << " (" << n_runs_total << ") "
<< "Workspace size should not depend on PerformanceConfig: "
<< default_solution.workspace_sz
<< " != " << current_solution.workspace_sz);
}

invoker = profile_h.PrepareInvoker(*current_solution.invoker_factory,
current_solution.construction_params);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <miopen/tensor_ops.hpp>
#include <miopen/miopen_internal.h>
#include <miopen/fusion/fusion_invoke_params.hpp>
#include <miopen/solver/implicitgemm_util.hpp>

#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
#include <ck/utility/data_type.hpp>
Expand All @@ -50,7 +51,30 @@ struct ProblemDescription;
} // namespace conv

namespace solver {

static constexpr int CkSplitkAutoDeduce = -1;

template <int L, int H>
inline static bool NextCKSplitkValue(int& v)
{
assert((IsTwoPower<L, H>(v) || v == CkSplitkAutoDeduce));
if(v == H)
{
v = CkSplitkAutoDeduce;
return false;
}
if(v == CkSplitkAutoDeduce)
{
v = L;
return true;
}

v *= 2;
return false;
}

#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL

namespace conv {
template <typename DataType>
using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<
Expand Down Expand Up @@ -300,6 +324,33 @@ bool IsCKApplicable(const ProblemDescriptionType& problem)
ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); });
}

template <typename DeviceOpType,
typename CKArgsType,
typename ProblemDescriptionType = miopen::conv::ProblemDescription>
size_t GetCKSplitkMaxWorkspaceSize(const ProblemDescriptionType& problem)
{
const auto args = CKArgsType{problem};
auto max_workspace_size = 0;

const auto ptrs = DeviceOpType::GetInstances();
for(auto& ptr : ptrs)
{
auto split_k = CkSplitkAutoDeduce;
do
{
if(args.IsSupportedBySplitK(ptr, split_k))
{
auto workspace_size = args.GetCKSplitkWorkspaceSize(ptr, split_k);
if(workspace_size > max_workspace_size)
max_workspace_size = workspace_size;
}
} while(!NextCKSplitkValue<1, 128>(split_k));
}

MIOPEN_LOG_I("Max workspace size reported by CK: " << max_workspace_size);
return max_workspace_size;
}

#define WORKAROUND_CK_ISSUE_1184 1
#if WORKAROUND_CK_ISSUE_1184
using WorkAroundHipEventProfiler = HipEventProfiler;
Expand Down Expand Up @@ -744,13 +795,14 @@ inline bool CKWrwRequireWorkspace(
}

/// \todo move to a cpp file
inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDescription& problem)
inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDescription& problem,
size_t ck_ws_size = 0)
{
if(problem.IsLayoutNHWC())
{
if(problem.GetDirection() == ::miopen::conv::Direction::BackwardWeights)
{
return GetCKAlphaBetaWorkspace(problem);
return (ck_ws_size > 0) ? ck_ws_size : GetCKAlphaBetaWorkspace(problem);
}
return 0;
}
Expand All @@ -759,10 +811,11 @@ inline size_t GetWorkspaceSizeLayoutTransformConv(const miopen::conv::ProblemDes

if(problem.GetDirection() == ::miopen::conv::Direction::BackwardWeights)
{
MultiBufferWorkspaceTraits wt({GetPackedSize(problem.GetIn()),
GetPackedSize(problem.GetWeights()),
GetPackedSize(problem.GetOut()),
GetCKAlphaBetaWorkspace(problem)});
MultiBufferWorkspaceTraits wt(
{GetPackedSize(problem.GetIn()),
GetPackedSize(problem.GetWeights()),
GetPackedSize(problem.GetOut()),
(ck_ws_size > 0) ? ck_ws_size : GetCKAlphaBetaWorkspace(problem)});
return wt.GetSize();
}

Expand Down Expand Up @@ -1079,18 +1132,21 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,

std::optional<CKBWDWeightBufferDescriptor> _ck_buff_des;

if(problem.IsDirectionBackwardWrW())
{
_ck_buff_des.emplace(GetCKAlphaBetaWorkspace(problem), 0);
}

auto ptr_iter = FindConvPtrByID(conv_ptrs, id_string);
if(ptr_iter == conv_ptrs.end())
{
MIOPEN_LOG_E("PerformanceConfig kernel '" + kernel_id + "' does not exist.");
return {miopenStatusInvalidValue};
}

if constexpr(std::is_same_v<CastType, miopen::conv::WrWInvokeParams>) {
auto ck_ws_size = ck_args.GetCKSplitkWorkspaceSize(*ptr_iter, split_k.value_or(1));
_ck_buff_des.emplace(ck_ws_size, 0);
result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size);
} else {
result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem);
}

auto [_input1_tr_inst, _input2_tr_inst, _output_tr_inst, _output_init_tr_inst] =
internal::MakeTaggedTransposeInstances<CKArgsType>(
result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des);
Expand Down Expand Up @@ -1197,8 +1253,6 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
output_tr_inst.ConvertTo(handle, kernels, conv_tensors);
};
};

result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem);
#endif
return result;
}
Expand Down Expand Up @@ -1235,8 +1289,9 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&,
ConvSolution result;
#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
miopenAlphaBetaCase_t alpha_beta_case = problem.GetAlphaBetaCase();
[[maybe_unused]] bool should_allocated_wrw_buffer =
ShouldAllocateWorkSpaceBufferForWRW(problem);
auto ck_args = CKArgsType{problem};
auto ck_ws_size = ck_args.GetCKSplitkWorkspaceSize(*ptr_iter, split_k.value_or(1));
[[maybe_unused]] bool should_allocated_wrw_buffer = ck_ws_size > 0;

result.invoker_factory = [kernel_id = kernel_id,
split_k = split_k,
Expand Down Expand Up @@ -1297,7 +1352,7 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&,
}
};
};
result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem);
result.workspace_sz = GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size);
#endif
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,29 +250,32 @@ struct CKArgs
template <typename ConvPtr>
bool IsSupportedBy(const ConvPtr& conv_ptr) const
{
auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1);
// Creat dummy workspace to pass the ck IsSupportedArgument check.

int dummy_var = 1;
conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var);

auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, 1);
auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get());
if(workspace_size != 0)
conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size);
return conv_ptr->IsSupportedArgument(arg_ptr.get());
}

template <typename ConvPtr>
bool IsSupportedBySplitK(const ConvPtr& conv_ptr, int split_k) const
{
auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k);

if(CKWrwRequireWorkspace(G, C1, K1, data_type, alpha_beta_case))
auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k);
auto workspace_size = conv_ptr->GetWorkSpaceSize(arg_ptr.get());
if(workspace_size != 0)
{
// Creat dummy workspace to pass the ck IsSupportedArgument check.
int dummy_var = 1;
conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &dummy_var);
conv_ptr->SetWorkSpacePointer(arg_ptr.get(), &workspace_size);
}
return conv_ptr->IsSupportedArgument(arg_ptr.get());
}

template <typename ConvPtr>
std::size_t GetCKSplitkWorkspaceSize(const ConvPtr& conv_ptr, int split_k) const
{
auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f, split_k);
return conv_ptr->GetWorkSpaceSize(arg_ptr.get());
}

int G;
int N;
int K;
Expand Down Expand Up @@ -398,7 +401,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::SetNextValue(
}
do
{
bool flag = NextTwoPower<1, 128>(split_k);
bool flag = NextCKSplitkValue<1, 128>(split_k);
if(!flag)
{
kernel_id = valid_kernels[index] + "+" + std::to_string(split_k);
Expand Down Expand Up @@ -465,11 +468,54 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig(
return config.IsValid(problem);
}

template <typename DataType>
size_t
ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const
{
#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
switch(problem.GetAlphaBetaCase())
{
case BILINEAR:
return GetCKSplitkMaxWorkspaceSize<DeviceOpGBwdWeightBilinearPtrs<DataType>,
CKArgs<DataType>>(problem);
case SCALE:
return GetCKSplitkMaxWorkspaceSize<DeviceOpGBwdWeightScalePtrs<DataType>, CKArgs<DataType>>(
problem);
default:
return GetCKSplitkMaxWorkspaceSize<DeviceOpGBwdWeightDefaultPtrs<DataType>,
CKArgs<DataType>>(problem);
}
#else
return 0;
#endif
}

size_t
ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize(const ProblemDescription& problem) const
{
#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
switch(problem.GetInDataType())
{
case miopenHalf: return GetCKMaxWorkspaceSize<ck::half_t>(problem);
case miopenFloat: return GetCKMaxWorkspaceSize<float>(problem);
case miopenInt8: return GetCKMaxWorkspaceSize<int8_t>(problem);
case miopenBFloat16: return GetCKMaxWorkspaceSize<ck::bhalf_t>(problem);
case miopenInt64:
case miopenInt32:
case miopenFloat8_fnuz:
case miopenBFloat8_fnuz:
case miopenDouble: break;
}
#endif
return 0; // other types not applicable for this solver
}

size_t
ConvHipImplicitGemm3DGroupWrwXdlops::GetWorkspaceSize(const ExecutionContext&,
const ProblemDescription& problem) const
{
return GetWorkspaceSizeLayoutTransformConv(problem);
auto ck_ws_size = GetCKMaxWorkspaceSize(problem);
return GetWorkspaceSizeLayoutTransformConv(problem, ck_ws_size);
}

PerformanceConfigHipImplicitGemm3DGroupWrwXdlops
Expand Down
Loading