Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
49a55cb
conv:tf32:add all instances
yingluAMD Nov 18, 2025
8e582f0
refact 3d grouped instances code
yingluAMD Nov 18, 2025
23cabd1
add 3dGrouped Bwd/Wrw unitests
yingluAMD Nov 18, 2025
27dd838
add instances for conv grouped(f/b/w)
yingluAMD Nov 19, 2025
ecdcbcb
fix clang-format
yingluAMD Nov 19, 2025
c6bd09e
fix clang-format
yingluAMD Nov 19, 2025
10944e4
fix clang-format
yingluAMD Nov 19, 2025
3b0d76c
fix word errors
yingluAMD Nov 19, 2025
ab5012c
fix word errors
yingluAMD Nov 19, 2025
c2c6043
disable ck bf16/tf32 cases on gfx90a
yingluAMD Nov 20, 2025
0efc849
disable ck bf16/tf32 cases on gfx90a
yingluAMD Nov 20, 2025
d592d71
disable ck bf16/tf32 cases on gfx90a
yingluAMD Nov 20, 2025
8acadb0
code review
yingluAMD Nov 24, 2025
7d72398
Merge branch 'develop' into tf32_instances
yingluAMD Dec 2, 2025
12c5bd4
Revert "Revert "MIOpen:feature:tf32:demonstrate tf32 in conv3d on MI3…
yingluAMD Dec 2, 2025
0b07e5b
bug fix
yingluAMD Dec 2, 2025
3cb1083
bug fix
yingluAMD Dec 2, 2025
bb5d33f
add gfx950 as supported device
yingluAMD Dec 2, 2025
496c639
fix clang-format fail
yingluAMD Dec 2, 2025
c7f1d1b
Merge branch 'develop' into tf32_instances
yingluAMD Dec 11, 2025
008d648
refine unit solver test
yingluAMD Dec 12, 2025
6b2d26b
fix review comments
yingluAMD Dec 12, 2025
786dba9
refeine test code
yingluAMD Dec 12, 2025
a309d7d
update jenkins file
yingluAMD Dec 14, 2025
0a9ce60
Merge branch 'develop' into tf32_instances
yingluAMD Dec 15, 2025
af1f31b
bug fix
yingluAMD Dec 15, 2025
3b7fd51
improment ci
yingluAMD Dec 15, 2025
9ada135
move IsTF32Supported to target properties.hpp
yingluAMD Dec 17, 2025
9c9de81
Merge commit 'f0ecbb525ec' into tf32_instances
yingluAMD Dec 19, 2025
e35745e
Merge branch 'develop' into tf32_instances
BradPepersAMD Dec 20, 2025
12393f2
change 3D test to new style
yingluAMD Dec 22, 2025
2339bd8
fix clang-format
yingluAMD Dec 22, 2025
422ce39
Merge branch 'develop' into tf32_instances
yingluAMD Dec 22, 2025
d74ecf1
decrease problem size to meet threshold
yingluAMD Dec 22, 2025
c9140ee
refine 3DGroupWrw test cases
yingluAMD Dec 22, 2025
0c97f59
Merge branch 'develop' into tf32_instances
yingluAMD Dec 23, 2025
c1bda69
fix applicability
yingluAMD Dec 23, 2025
303e360
Merge branch 'develop' into tf32_instances
yingluAMD Jan 12, 2026
8384fa5
fix merge issues
yingluAMD Jan 13, 2026
2230980
Merge branch 'develop' into tf32_instances
yingluAMD Jan 15, 2026
c0c687a
Merge branch 'develop' into tf32_instances
yingluAMD Jan 19, 2026
c381273
fix merge issues
yingluAMD Jan 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion projects/miopen/driver/conv_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ class ConvDriver : public Driver

{ // tf32 has same mantissa length as fp16
auto math_type_ = inflags.GetValueInt("math_type");
if(std::is_same_v<Tgpu, float> && (miopen::EnvEnableTF32() || math_type_))
if(std::is_same_v<Tgpu, float> && (miopen::EnvEnableTF32() || (math_type_ == 0)))
Comment thread
yingluAMD marked this conversation as resolved.
Outdated
Comment thread
yingluAMD marked this conversation as resolved.
Outdated
tolerance = 8.2e-3;
}
return tolerance;
Expand Down
28 changes: 19 additions & 9 deletions projects/miopen/src/include/miopen/conv/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4436,6 +4436,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops
MIOPEN_INTERNALS_EXPORT bool
IsModelApplicable(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem) const;
bool UseTF32() const { return use_tf32; }

private:
#if MIOPEN_ENABLE_AI_KERNEL_TUNING
Expand All @@ -4451,6 +4452,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops
void Init(const miopen::conv::ProblemDescription&);
template <typename DataType>
bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const;
mutable bool use_tf32 = false;
Comment thread
yingluAMD marked this conversation as resolved.
};

struct ConvHipImplicitGemmGroupFwdXdlops final
Expand Down Expand Up @@ -4603,13 +4605,15 @@ struct PerformanceConfigHipImplicitGemm3DGroupWrwXdlops
MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const;
MIOPEN_INTERNALS_EXPORT bool
operator==(const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& other) const;
bool UseTF32() const { return use_tf32; }

private:
template <typename DataType>
void Init(const miopen::conv::ProblemDescription&);
template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
bool Init(const miopen::conv::ProblemDescription&);
template <typename DataType, typename ComputeType = DataType>
bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const;
void InitValidKernels(const miopen::conv::ProblemDescription& problem);
mutable bool use_tf32 = false;
Comment thread
yingluAMD marked this conversation as resolved.
};

struct ConvHipImplicitGemm3DGroupWrwXdlops final
Expand Down Expand Up @@ -4649,10 +4653,10 @@ struct ConvHipImplicitGemm3DGroupWrwXdlops final
bool MayNeedWorkspace() const override { return true; }

private:
template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
std::size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription&) const;
size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const;
};
Expand Down Expand Up @@ -4687,13 +4691,15 @@ struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops
MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const;
MIOPEN_INTERNALS_EXPORT bool
operator==(const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& other) const;
bool UseTF32() const { return use_tf32; }

private:
template <typename DataType>
void Init(const miopen::conv::ProblemDescription&);
template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
bool Init(const miopen::conv::ProblemDescription&);
template <typename DataType, typename ComputeType = DataType>
bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const;
void InitValidKernels(const miopen::conv::ProblemDescription& problem);
mutable bool use_tf32 = false;
Comment thread
yingluAMD marked this conversation as resolved.
};

struct ConvHipImplicitGemm3DGroupBwdXdlops final
Expand Down Expand Up @@ -4733,7 +4739,7 @@ struct ConvHipImplicitGemm3DGroupBwdXdlops final
bool MayNeedWorkspace() const override { return true; }

private:
template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const;
};

Expand Down Expand Up @@ -4771,6 +4777,7 @@ struct PerformanceConfigHipImplicitGemmGroupBwdXdlops
MIOPEN_INTERNALS_EXPORT bool
IsModelApplicable(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem) const;
bool UseTF32() const { return use_tf32; }

private:
#if MIOPEN_ENABLE_AI_KERNEL_TUNING
Expand All @@ -4789,6 +4796,7 @@ struct PerformanceConfigHipImplicitGemmGroupBwdXdlops
void Init(const miopen::conv::ProblemDescription&);
template <typename DataType>
bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const;
mutable bool use_tf32 = false;
Comment thread
yingluAMD marked this conversation as resolved.
};

struct ConvHipImplicitGemmGroupBwdXdlops final
Expand Down Expand Up @@ -4867,6 +4875,7 @@ struct PerformanceConfigHipImplicitGemmGroupWrwXdlops
MIOPEN_INTERNALS_EXPORT bool
IsModelApplicable(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem) const;
bool UseTF32() const { return use_tf32; }

private:
#if MIOPEN_ENABLE_AI_KERNEL_TUNING
Expand All @@ -4885,6 +4894,7 @@ struct PerformanceConfigHipImplicitGemmGroupWrwXdlops
void Init(const miopen::conv::ProblemDescription&);
template <typename DataType>
bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const;
mutable bool use_tf32 = false;
Comment thread
yingluAMD marked this conversation as resolved.
};

struct ConvHipImplicitGemmGroupWrwXdlops final
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ inline static bool NextCKSplitkValue(int& v)
#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL

namespace conv {
template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<
2,
ck::tensor_layout::convolution::NHWGC,
Expand All @@ -87,12 +87,13 @@ using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<
DataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
template <typename DataType>
using DeviceOpGWrwPtrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOpGWrw<DataType>>;
ck::tensor_operation::element_wise::PassThrough,
ComputeType>;
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGWrwPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOpGWrw<DataType, ComputeType>>;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwd = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<
2,
ck::tensor_layout::convolution::NHWGK,
Expand All @@ -105,11 +106,12 @@ using DeviceOpGBwd = ck::tensor_operation::device::DeviceGroupedConvBwdDataMulti
DataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
ck::tensor_operation::element_wise::PassThrough,
ComputeType>;

template <typename DataType>
using DeviceOpGBwdPtrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOpGBwd<DataType>>;
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOpGBwd<DataType, ComputeType>>;

using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
Expand All @@ -118,7 +120,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
using Scale = ck::tensor_operation::element_wise::Scale;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdWeightDefault =
ck::tensor_operation::device::DeviceGroupedConvBwdWeight<3,
InLayout,
Expand All @@ -129,9 +131,10 @@ using DeviceOpGBwdWeightDefault =
DataType,
PassThrough,
PassThrough,
PassThrough>;
PassThrough,
ComputeType>;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdWeightBilinear =
ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD<3,
InLayout,
Expand All @@ -144,9 +147,10 @@ using DeviceOpGBwdWeightBilinear =
ck::Tuple<DataType>,
PassThrough,
Bilinear,
PassThrough>;
PassThrough,
ComputeType>;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdWeightScale =
ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD<3,
InLayout,
Expand All @@ -159,22 +163,23 @@ using DeviceOpGBwdWeightScale =
ck::Tuple<>,
PassThrough,
Scale,
PassThrough>;
PassThrough,
ComputeType>;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdWeightDefaultPtrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOpGBwdWeightDefault<DataType>>;
DeviceOpGBwdWeightDefault<DataType, ComputeType>>;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdWeightBilinearPtrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOpGBwdWeightBilinear<DataType>>;
DeviceOpGBwdWeightBilinear<DataType, ComputeType>>;

template <typename DataType>
template <typename DataType, typename ComputeType = DataType>
using DeviceOpGBwdWeightScalePtrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOpGBwdWeightScale<DataType>>;
DeviceOpGBwdWeightScale<DataType, ComputeType>>;

} // namespace conv

Expand Down Expand Up @@ -241,27 +246,33 @@ std::vector<std::string> FillValidKernelsIDs(const ProblemDescriptionType& probl
}

#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
// TODO: whether split_k is needed for tf32?
Comment thread
yingluAMD marked this conversation as resolved.
Outdated
template <typename DeviceOpType>
inline constexpr bool IsSplitKNeeded()
{
return std::is_same_v<DeviceOpType, conv::DeviceOpGWrwPtrs<ck::half_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGWrwPtrs<float>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGWrwPtrs<float, ck::tf32_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGWrwPtrs<int8_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGWrwPtrs<ck::bhalf_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdPtrs<ck::half_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdPtrs<float>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdPtrs<float, ck::tf32_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdPtrs<int8_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdPtrs<ck::bhalf_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightDefaultPtrs<ck::half_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightDefaultPtrs<float>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightDefaultPtrs<float, ck::tf32_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightDefaultPtrs<int8_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightDefaultPtrs<ck::bhalf_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightBilinearPtrs<ck::half_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightBilinearPtrs<float>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightBilinearPtrs<float, ck::tf32_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightBilinearPtrs<int8_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightBilinearPtrs<ck::bhalf_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightScalePtrs<ck::half_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightScalePtrs<float>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightScalePtrs<float, ck::tf32_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightScalePtrs<int8_t>> ||
std::is_same_v<DeviceOpType, conv::DeviceOpGBwdWeightScalePtrs<ck::bhalf_t>>;
}
Expand Down
14 changes: 11 additions & 3 deletions projects/miopen/src/ocl/convolutionocl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1054,6 +1054,7 @@ std::size_t ConvolutionDescriptor::GetForwardSolutionWorkspaceSize(const Handle&
conv::ProblemDescription{xDesc, wDesc, yDesc, *this, conv::Direction::Forward};
auto ctx = ExecutionContext{};
ctx.SetStream(&handle);
problem.SetupComputeType(ctx);
if(sol.IsApplicable(ctx, problem))
return sol.GetWorkspaceSize(ctx, problem);
MIOPEN_THROW(miopenStatusBadParm,
Expand Down Expand Up @@ -1091,7 +1092,8 @@ void ConvolutionDescriptor::ConvolutionForwardImmediate(const Handle& handle,
ConvForwardCheckNumerics(handle, tensors, [&]() {
const auto problem =
conv::ProblemDescription{xDesc, wDesc, yDesc, *this, conv::Direction::Forward};
const auto ctx = ExecutionContext{&handle};
const auto ctx = ExecutionContext{&handle};
problem.SetupComputeType(ctx);
const auto invoker = LoadOrPrepareInvoker(ctx, problem, solver_id);
const auto invoke_ctx = conv::DataInvokeParams{
tensors, workSpace, workSpaceSize, this->attribute.gfx90aFp16alt.GetFwd()};
Expand Down Expand Up @@ -1216,6 +1218,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(const Handle& handle,
const auto problem = conv::ProblemDescription{
dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData, 0, alpha_val, beta_val};
ValidateAlphaBeta(problem);
problem.SetupComputeType(ExecutionContext{&handle});
Comment thread
yingluAMD marked this conversation as resolved.
Outdated

ConvBwdCheckNumerics(handle, tensors, beta, [&]() {
if(dyDesc.GetLengths()[1] != wDesc.GetLengths()[0])
Expand Down Expand Up @@ -1260,6 +1263,7 @@ std::size_t ConvolutionDescriptor::GetBackwardSolutionWorkspaceSize(const Handle
conv::ProblemDescription{dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData};
auto ctx = ExecutionContext{};
ctx.SetStream(&handle);
problem.SetupComputeType(ctx);
if(sol.IsApplicable(ctx, problem))
{
return sol.GetWorkspaceSize(ctx, problem);
Expand Down Expand Up @@ -1299,7 +1303,8 @@ void ConvolutionDescriptor::ConvolutionBackwardImmediate(const Handle& handle,

const auto problem =
conv::ProblemDescription{dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData};
const auto ctx = ExecutionContext{&handle};
const auto ctx = ExecutionContext{&handle};
problem.SetupComputeType(ctx);
const auto invoker = LoadOrPrepareInvoker(ctx, problem, solver_id);
const auto invoke_ctx = conv::DataInvokeParams{
tensors, workSpace, workSpaceSize, this->attribute.gfx90aFp16alt.GetBwd()};
Expand Down Expand Up @@ -1423,6 +1428,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(const Handle& handle,
decltype(auto) problem =
conv::ProblemDescription{dyDesc, dwDesc, xDesc, *this, direction, 0, alpha_val, beta_val};
ValidateAlphaBeta(problem);
problem.SetupComputeType(ExecutionContext{&handle});
Comment thread
yingluAMD marked this conversation as resolved.
Outdated

if(xDesc.GetType() == miopenInt8)
MIOPEN_THROW(miopenStatusBadParm);
Expand Down Expand Up @@ -1465,6 +1471,7 @@ std::size_t ConvolutionDescriptor::GetWrwSolutionWorkspaceSize(const Handle& han
conv::ProblemDescription{dyDesc, dwDesc, xDesc, *this, conv::Direction::BackwardWeights};
auto ctx = ExecutionContext{};
ctx.SetStream(&handle);
problem.SetupComputeType(ctx);
if(sol.IsApplicable(ctx, problem))
{
return sol.GetWorkspaceSize(ctx, problem);
Expand Down Expand Up @@ -1502,7 +1509,8 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(const Handle& handle,

const auto problem = conv::ProblemDescription{
dyDesc, dwDesc, xDesc, *this, conv::Direction::BackwardWeights};
const auto ctx = ExecutionContext{&handle};
const auto ctx = ExecutionContext{&handle};
problem.SetupComputeType(ctx);
const auto invoker = LoadOrPrepareInvoker(ctx, problem, solver_id);
const auto invoke_ctx = conv::WrWInvokeParams{
tensors, workSpace, workSpaceSize, this->attribute.gfx90aFp16alt.GetWrW()};
Expand Down
Loading
Loading