Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8d8f80e
select CK kernel
jfactory07 Jul 11, 2025
47e973c
add db and exclude fp32
jfactory07 Jul 18, 2025
c1903a1
add fp16
jfactory07 Jul 18, 2025
314588c
revert fp16 db add handle small k for fp16
jfactory07 Jul 22, 2025
38b9afd
fix auto tune
jfactory07 Jul 22, 2025
faa4e37
refine
jfactory07 Jul 28, 2025
73f1fce
Merge branch 'develop' into jzhou/3dconv
jfactory07 Jul 28, 2025
e9a64a1
refine
jfactory07 Jul 28, 2025
5a2f726
typo
jfactory07 Jul 28, 2025
1f9d1f2
check format
jfactory07 Jul 28, 2025
52cb9a4
Merge branch 'develop' into jzhou/3dconv
ammallya Jul 28, 2025
8e82e15
Merge commit '52cb9a45a6108049c39122e98b211110e6b9df9f' into import/d…
assistant-librarian[bot] Jul 28, 2025
2bf2d6d
refine
jfactory07 Jul 30, 2025
b008a67
refine format
jfactory07 Jul 30, 2025
ce00a1a
add unit test
jfactory07 Jul 30, 2025
860c4c4
handle large n
jfactory07 Jul 30, 2025
d0ffee5
refine
jfactory07 Jul 31, 2025
71fccd5
add new unit test
jfactory07 Jul 31, 2025
646c7bb
typo
jfactory07 Jul 31, 2025
2648208
Merge branch 'develop' into import/develop/ROCm_MIOpen/jzhou_3dconv
JonathanLichtnerAMD Jul 31, 2025
d382f82
Add newline at end of perf_config_HipImplicitGemm3DGroupFwdXdlops.cpp
JonathanLichtnerAMD Jul 31, 2025
e94b3e0
Match unit test naming convention in perf_config_HipImplicitGemm3DGro…
JonathanLichtnerAMD Jul 31, 2025
3359448
Merge branch 'develop' into import/develop/ROCm_MIOpen/jzhou_3dconv
jfactory07 Aug 5, 2025
3331267
Merge branch 'develop' into import/develop/ROCm_MIOpen/jzhou_3dconv
jfactory07 Aug 5, 2025
34366e8
Merge branch 'develop' into import/develop/ROCm_MIOpen/jzhou_3dconv
BrianHarrisonAMD Aug 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions projects/miopen/src/include/miopen/conv/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4556,11 +4556,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final
GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override;
/// \ref igemm_get_wti_magic_number
/// Returns the same fixed value (0.02f) for every context and problem.
float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override
{
    constexpr float fixed_wti = 0.02f;
    return fixed_wti;
}
float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;

MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize(
const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#endif
#include <miopen/solver/implicitgemm_ck_util.hpp>
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)
MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE);

namespace miopen {
namespace solver {
Expand Down Expand Up @@ -360,7 +361,69 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init(const ProblemDescrip
FillValidKernelsIDs<DeviceOpGFwdDefaultPtrs<DataType>, CKArgs<DataType>>(problem);
break;
}
index = 0;
index = 0;

// Maps a preferred kernel to its position in valid_kernels.
//   hint      - position where the kernel is expected to be; tried first.
//   kernel_id - kernel name to look for when the hint does not match.
// Returns the position of kernel_id, or 0 if it is absent (an error is logged in
// that case). Note that 0 is also a legitimate position, so callers cannot tell
// "found at 0" apart from "not found".
auto find_kernel = [&valid_kernels = std::as_const(valid_kernels)](
                       const std::size_t& hint, const std::string& kernel_id) -> std::size_t {
    // Fast path: the hint already points at the requested kernel.
    const bool hint_matches = hint < valid_kernels.size() && valid_kernels[hint] == kernel_id;
    if(hint_matches)
        return hint;

    // Slow path: scan the whole list for the kernel name.
    const auto first = valid_kernels.begin();
    const auto last  = valid_kernels.end();
    const auto pos   = std::find(first, last, kernel_id);
    if(pos == last)
    {
        MIOPEN_LOG_E("Not found :" << hint << "-" << kernel_id);
        return 0;
    }
    return static_cast<std::size_t>(pos - first);
};

// Debug override: a non-zero value of
// MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE is taken as the
// kernel index and bypasses the heuristic below.
// NOTE(review): this relies on env::value() yielding 0 when the variable is not
// set - confirm.
index = env::value(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE);
if(index != 0 && static_cast<std::size_t>(index) >= valid_kernels.size())
{
    // The override is user input. An out-of-range value would make the
    // valid_kernels[index] lookup that follows read past the end of the vector,
    // so fall back to "no override".
    MIOPEN_LOG_E("Kernel index override is out of range: " << index);
    index = 0;
}

// Heuristic for BF16 and FP16: with no override in effect, more than 8 input
// channels, a single group and the default alpha/beta case, pick a specific
// kernel depending on the number of output channels. The numeric argument to
// find_kernel is only a position hint; the kernel is located by name if the
// hint does not match, and index stays 0 if the name is not in valid_kernels.
if(index == 0 && problem.GetInChannels() > 8 && problem.GetGroupCount() == 1 &&
   problem.GetAlphaBetaCase() == DEFAULT)
{
    const int K = problem.GetOutChannels();
    if(problem.GetInDataType() == miopenBFloat16)
    {
        if(K < 64)
        {
            index =
                find_kernel(38,
                            "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
                            "<256, 64, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, "
                            "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>");
        }
        else
        {
            index =
                find_kernel(30,
                            "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
                            "<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, "
                            "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>");
        }
    }
    else if(problem.GetInDataType() == miopenHalf)
    {
        if(K < 64)
        {
            index =
                find_kernel(57,
                            "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
                            "<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, "
                            "BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>");
        }
        else
        {
            index =
                find_kernel(31,
                            "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
                            "<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, "
                            "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>");
        }
    }
}
Comment thread
jfactory07 marked this conversation as resolved.
kernel_id = valid_kernels[index];
}

Expand Down Expand Up @@ -425,6 +488,11 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::SetNextValue(
{
HeuristicInit(problem);
assert(!valid_kernels.empty());
if(index != 0)
{
index = 0;
kernel_id = valid_kernels[index];
}
Comment thread
BrianHarrisonAMD marked this conversation as resolved.
return true;
}
if((index + 1) < valid_kernels.size())
Expand Down Expand Up @@ -540,6 +608,34 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
return false;
}

/// Returns the GetWti value for this solver, based on the problem's input and
/// weight tensors (the execution context is not used).
///
/// - Input type FP16 or BF16 and a filter that is not 1x1x1:
///   0.00002 ("force disable") when the input has fewer than 8 channels and a
///   batch size below 4, otherwise 1.0 ("force enable").
/// - Every other case: 0.02f.
float ConvHipImplicitGemm3DGroupFwdXdlops::GetWti(
const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const
{
// Tensor descriptors the decision below is based on.
decltype(auto) xDesc = problem.GetIn();
decltype(auto) wDesc = problem.GetWeights();
Comment on lines +611 to +615
Copy link
Copy Markdown
Member

@ScottTodd ScottTodd Aug 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm observing build failures on Windows pointing to this code as part of updating the rocm-libraries version used/tested in TheRock at ROCm/TheRock#1195.

On Windows we build with CK disabled (-DMIOPEN_USE_COMPOSABLEKERNEL=OFF) as well as some other possibly relevant options: https://github.com/ROCm/TheRock/blob/9b873dd83d594a6abaa8db3890c09ab445651df0/ml-libs/CMakeLists.txt#L102-L115

Error logs:

[build] [1/5] Building sub-project MIOpen (in background)
[build] FAILED: ml-libs/MIOpen/stamp/build.stamp D:/projects/TheRock/build/ml-libs/MIOpen/stamp/build.stamp 
[build] C:\Windows\system32\cmd.exe /C "cd /D D:\projects\TheRock\build\ml-libs\MIOpen\build && d:\projects\TheRock\.venv\Scripts\python D:/projects/TheRock/build_tools/teatime.py --log-timestamps --label MIOpen --interactive D:/projects/TheRock/build/logs/MIOpen_build.log -- "C:/Program Files/CMake/bin/cmake.exe" -E env --unset=ROCM_PATH --unset=ROCM_DIR --unset=HIP_PATH --unset=HIP_DIR -- "C:/Program Files/CMake/bin/cmake.exe" --build D:/projects/TheRock/build/ml-libs/MIOpen/build && "C:\Program Files\CMake\bin\cmake.exe" -E touch D:/projects/TheRock/build/ml-libs/MIOpen/stamp/build.stamp"
[build] [MIOpen] [0/2] Re-checking globbed directories...
[build] [MIOpen] [1/2] Linking CXX executable bin\miopen_gtest.exe
[build] [MIOpen] FAILED: bin/miopen_gtest.exe 
[build] [MIOpen] C:\Windows\system32\cmd.exe /C "cd . && D:\projects\TheRock\build\core\clr\dist\lib\llvm\bin\clang++.exe -nostartfiles -nostdlib -DWIN32 -DWIN32_LEAN_AND_MEAN -D_CRT_SECURE_NO_WARNINGS -DNOMINMAX -fms-extensions -fms-compatibility -D_ENABLE_EXTENDED_ALIGNED_STORAGE  -Wno-documentation-unknown-command -Wno-documentation-pedantic -Wno-unused-command-line-argument -Wno-explicit-specialization-storage-class -Wno-ignored-attributes -Wno-unknown-attributes -Wno-duplicate-decl-specifier --hip-path=D:/projects/TheRock/build/core/clr/dist --hip-device-lib-path=D:/projects/TheRock/build/core/clr/dist/lib/llvm/amdgcn/bitcode -O3 -DNDEBUG -D_DLL -D_MT -Xclang --dependent-lib=msvcrt -L D:/projects/TheRock/build/third-party/sysdeps/windows/zlib/build/stage/lib/rocm_sysdeps/lib  -L D:/projects/TheRock/build/third-party/sysdeps/windows/zstd/build/stage/lib/rocm_sysdeps/lib  -L D:/projects/TheRock/build/compiler/amd-llvm/stage/lib/llvm/lib  -L D:/projects/TheRock/build/core/clr/stage/lib  -L D:/projects/TheRock/build/third-party/host-blas/stage/lib/host-math/lib  -L D:/projects/TheRock/build/third-party/sysdeps/windows/bzip2/build/stage/lib/rocm_sysdeps/lib  -L D:/projects/TheRock/build/third-party/sysdeps/windows/sqlite3/build/stage/lib/rocm_sysdeps/lib   -Xlinker /subsystem:console   -fuse-ld=lld-link @CMakeFiles\miopen_gtest.rsp -o bin\miopen_gtest.exe -Xlinker /MANIFEST:EMBED -Xlinker /implib:lib\miopen_gtest.lib -Xlinker /pdb:bin\miopen_gtest.pdb -Xlinker /version:0.0   && cd ."
[build] [MIOpen] lld-link: error: undefined symbol: public: virtual float __cdecl miopen::solver::conv::ConvHipImplicitGemm3DGroupFwdXdlops::GetWti(struct miopen::ExecutionContext const &, struct miopen::conv::ProblemDescription const &) const
[build] [MIOpen] >>> referenced by test/gtest/CMakeFiles/miopen_gtest.dir/group_conv3d_fwd.cpp.obj:(const miopen::solver::conv::ConvHipImplicitGemm3DGroupFwdXdlops::`vftable')
[build] [MIOpen] >>> referenced by test/gtest/CMakeFiles/miopen_gtest.dir/nonpack_conv3d_fwd.cpp.obj
[build] [MIOpen] clang++: error: linker command failed with exit code 1 (use -v to see invocation)

[build] [MIOpen] ninja: build stopped: subcommand failed.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, since it got moved to a .cpp file, it needs the symbol exported now.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll make a quick PR.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, thank you.

I'm testing this now:

diff --git a/projects/miopen/src/include/miopen/conv/solvers.hpp b/projects/miopen/src/include/miopen/conv/solvers.hpp
index 513b5552f2..f7b66514fd 100644
--- a/projects/miopen/src/include/miopen/conv/solvers.hpp
+++ b/projects/miopen/src/include/miopen/conv/solvers.hpp
@@ -4556,7 +4556,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final
     GetSolution(const ExecutionContext&,
                 const miopen::conv::ProblemDescription&,
                 const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override;
-    float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
+    MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;

     MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize(
         const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Success:

165.2	[200/201] Linking CXX executable bin\miopen_gtest.exe
END	1754599261.0221655	165.21049785614014

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR here


if(xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16)
{
std::size_t in_n, in_c, w_x, w_y, w_d;
std::tie(in_n, in_c) = tie_pick<0, 1>()(xDesc.GetLengths());
std::tie(w_x, w_y, w_d) = tie_pick<2, 3, 4>()(wDesc.GetLengths());
// For cases where the filter shape is not 1x1x1 and the input channel (in_c) is greater
// than 8, CK's implementation offers better performance.
if((w_x == 1 && w_y == 1 && w_d == 1) == false)
{
if(in_c < 8 && in_n < 4)
{
return 0.00002; // force disable
}
else
{
return 1.0; // force enable
}
}
}
return 0.02f;
}

ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
[[maybe_unused]] const ExecutionContext& ctx,
[[maybe_unused]] const ProblemDescription& problem,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <gtest/gtest.h>
#include <gtest/group_conv.hpp>

#include <miopen/tensor.hpp>
#include <miopen/conv/problem_description.hpp>
#include <miopen/conv/solvers.hpp>
#include <sstream>

// Short name for the convolution problem description type.
using Problem = miopen::conv::ProblemDescription;
// Short name for the performance config whose HeuristicInit() is tested in this file.
using Config = miopen::solver::conv::PerformanceConfigHipImplicitGemm3DGroupFwdXdlops;

struct PerfConfigTestCase
{
struct group_conv::GroupConvTestConfig<3u> conv;
miopenDataType_t data_type;
miopenTensorLayout_t layout;
std::string arch;
};

/// Builds the list of test cases for the given data type and target device name.
///
/// The two cases are identical except for the fourth value of the convolution
/// config (32 vs. 192).
/// NOTE(review): presumably that value is the output channel count, chosen to hit
/// both sides of the heuristic's K < 64 split - confirm against GroupConvTestConfig.
std::vector<PerfConfigTestCase> GetPerfConfigTestCases(miopenDataType_t data_type, std::string arch)
{
    std::vector<PerfConfigTestCase> cases;
    cases.reserve(2);

    cases.push_back({{1, 128, 64, 32, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}},
                     data_type,
                     miopenTensorNCDHW,
                     arch});

    cases.push_back({{1, 128, 64, 192, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}},
                     data_type,
                     miopenTensorNCDHW,
                     arch});

    return cases;
}

/// Value-parameterized fixture: checks that HeuristicInit() selects a kernel index
/// other than 0 for the given problem.
///
/// The template parameter only serves to create one distinct fixture type per
/// data type; the data type actually used comes from the test parameter.
template <miopenDataType_t data_type>
class PerfConfig_HipImplicitGemm3DGroupFwdXdlops
    : public ::testing::TestWithParam<PerfConfigTestCase>
{
protected:
    /// Builds the forward problem described by the test parameter and runs the heuristic.
    void TestConfigs()
    {
        auto param = GetParam();

        auto&& handle = get_handle();
        miopen::ExecutionContext ctx(&handle);

        // The case targets one specific device; skip everywhere else.
        const bool arch_matches = (test_case_arch(param) == ctx.GetStream().GetDeviceName());
        if(!arch_matches)
        {
            GTEST_SKIP();
        }

        miopen::TensorDescriptor in_desc(param.data_type, param.conv.GetInput());
        miopen::TensorDescriptor wei_desc(param.data_type, param.layout, param.conv.GetWeights());

        auto conv_desc = param.conv.GetConv();
        auto out_desc  = conv_desc.GetForwardOutputTensor(in_desc, wei_desc, param.data_type);

        Problem problem(in_desc, wei_desc, out_desc, conv_desc, miopen::conv::Direction::Forward);

        Config cfg;
        cfg.HeuristicInit(problem);
        EXPECT_TRUE(cfg.index != 0) << "index is 0:" << param.conv;
    }

private:
    /// Device name the given case is meant for.
    static const std::string& test_case_arch(const PerfConfigTestCase& test_case)
    {
        return test_case.arch;
    }
};

// One concrete fixture type per data type, so each gets its own test suite name.
using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16 =
PerfConfig_HipImplicitGemm3DGroupFwdXdlops<miopenBFloat16>;
using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16 =
PerfConfig_HipImplicitGemm3DGroupFwdXdlops<miopenHalf>;

// Both suites run the same check; only the parameter sets differ.
TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16, All) { TestConfigs(); }
TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16, All) { TestConfigs(); }

// The cases are created for "gfx942"; TestConfigs() skips them when the device
// name of the current stream is different.
INSTANTIATE_TEST_SUITE_P(Full,
GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16,
testing::ValuesIn(GetPerfConfigTestCases(miopenBFloat16, "gfx942")));

INSTANTIATE_TEST_SUITE_P(Full,
GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16,
testing::ValuesIn(GetPerfConfigTestCases(miopenHalf, "gfx942")));