-
Notifications
You must be signed in to change notification settings - Fork 294
[miopen] Enhance 3D convolution performance in immediate mode #877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8d8f80e
47e973c
c1903a1
314588c
38b9afd
faa4e37
73f1fce
e9a64a1
5a2f726
1f9d1f2
52cb9a4
8e82e15
2bf2d6d
b008a67
ce00a1a
860c4c4
d0ffee5
71fccd5
646c7bb
2648208
d382f82
e94b3e0
3359448
3331267
34366e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,7 @@ | |
| #endif | ||
| #include <miopen/solver/implicitgemm_ck_util.hpp> | ||
| MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) | ||
| MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE); | ||
|
|
||
| namespace miopen { | ||
| namespace solver { | ||
|
|
@@ -360,7 +361,69 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init(const ProblemDescrip | |
| FillValidKernelsIDs<DeviceOpGFwdDefaultPtrs<DataType>, CKArgs<DataType>>(problem); | ||
| break; | ||
| } | ||
| index = 0; | ||
| index = 0; | ||
|
|
||
| auto find_kernel = [&valid_kernels = std::as_const(valid_kernels)]( | ||
| const std::size_t& index, const std::string& kernel_id) -> std::size_t { | ||
| // Check if valid_kernels[index] equals kernel_id. | ||
| if(index < valid_kernels.size() && valid_kernels[index] == kernel_id) | ||
| return index; | ||
|
|
||
| // Linear search for kernel_id in valid_kernels. | ||
| auto it = std::find(valid_kernels.begin(), valid_kernels.end(), kernel_id); | ||
| if(it != valid_kernels.end()) | ||
| return static_cast<std::size_t>(it - valid_kernels.begin()); | ||
|
|
||
| // Not found: return 0 | ||
| MIOPEN_LOG_E("Not found :" << index << "-" << kernel_id); | ||
| return 0; | ||
| }; | ||
|
|
||
| // for BF16 and FP16 | ||
| index = env::value(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE); | ||
| if(index == 0 && problem.GetInChannels() > 8 && problem.GetGroupCount() == 1 && | ||
| problem.GetAlphaBetaCase() == DEFAULT) | ||
| { | ||
| int K = problem.GetOutChannels(); | ||
| if(problem.GetInDataType() == miopenBFloat16) | ||
| { | ||
| if(K < 64) | ||
| { | ||
| index = | ||
| find_kernel(38, | ||
| "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" | ||
| "<256, 64, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, " | ||
| "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>"); | ||
| } | ||
| else | ||
| { | ||
| index = | ||
| find_kernel(30, | ||
| "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" | ||
| "<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, " | ||
| "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>"); | ||
| } | ||
| } | ||
| else if(problem.GetInDataType() == miopenHalf) | ||
| { | ||
| if(K < 64) | ||
| { | ||
| index = | ||
| find_kernel(57, | ||
| "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" | ||
| "<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, " | ||
| "BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>"); | ||
| } | ||
| else | ||
| { | ||
| index = | ||
| find_kernel(31, | ||
| "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3" | ||
| "<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, " | ||
| "BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>"); | ||
| } | ||
| } | ||
| } | ||
| kernel_id = valid_kernels[index]; | ||
| } | ||
|
|
||
|
|
@@ -425,6 +488,11 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::SetNextValue( | |
| { | ||
| HeuristicInit(problem); | ||
| assert(!valid_kernels.empty()); | ||
| if(index != 0) | ||
| { | ||
| index = 0; | ||
| kernel_id = valid_kernels[index]; | ||
| } | ||
|
BrianHarrisonAMD marked this conversation as resolved.
|
||
| return true; | ||
| } | ||
| if((index + 1) < valid_kernels.size()) | ||
|
|
@@ -540,6 +608,34 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( | |
| return false; | ||
| } | ||
|
|
||
| float ConvHipImplicitGemm3DGroupFwdXdlops::GetWti( | ||
| const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const | ||
| { | ||
| decltype(auto) xDesc = problem.GetIn(); | ||
| decltype(auto) wDesc = problem.GetWeights(); | ||
|
Comment on lines
+611
to
+615
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm observing build failures on Windows pointing to this code as part of updating the rocm-libraries version used/tested in TheRock at ROCm/TheRock#1195. On Windows we build with CK disabled ( Error logs:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh since it got moved to a CPP it needs the symbol exported now.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ill make a quick PR.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, thank you. I'm testing this now: diff --git a/projects/miopen/src/include/miopen/conv/solvers.hpp b/projects/miopen/src/include/miopen/conv/solvers.hpp
index 513b5552f2..f7b66514fd 100644
--- a/projects/miopen/src/include/miopen/conv/solvers.hpp
+++ b/projects/miopen/src/include/miopen/conv/solvers.hpp
@@ -4556,7 +4556,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final
GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override;
- float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
+ MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize(
const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Success:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PR here |
||
|
|
||
| if(xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16) | ||
| { | ||
| std::size_t in_n, in_c, w_x, w_y, w_d; | ||
| std::tie(in_n, in_c) = tie_pick<0, 1>()(xDesc.GetLengths()); | ||
| std::tie(w_x, w_y, w_d) = tie_pick<2, 3, 4>()(wDesc.GetLengths()); | ||
| // For cases where the filter shape is not 1x1x1 and the input channel (in_c) is greater | ||
| // than 8, CK's implementation offers better performance. | ||
| if((w_x == 1 && w_y == 1 && w_d == 1) == false) | ||
| { | ||
| if(in_c < 8 && in_n < 4) | ||
| { | ||
| return 0.00002; // force disable | ||
| } | ||
| else | ||
| { | ||
| return 1.0; // force enable | ||
| } | ||
| } | ||
| } | ||
| return 0.02f; | ||
| } | ||
|
|
||
| ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( | ||
| [[maybe_unused]] const ExecutionContext& ctx, | ||
| [[maybe_unused]] const ProblemDescription& problem, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| #include <gtest/gtest.h> | ||
| #include <gtest/group_conv.hpp> | ||
|
|
||
| #include <miopen/tensor.hpp> | ||
| #include <miopen/conv/problem_description.hpp> | ||
| #include <miopen/conv/solvers.hpp> | ||
| #include <sstream> | ||
|
|
||
| using Problem = miopen::conv::ProblemDescription; | ||
| using Config = miopen::solver::conv::PerformanceConfigHipImplicitGemm3DGroupFwdXdlops; | ||
|
|
||
| struct PerfConfigTestCase | ||
| { | ||
| struct group_conv::GroupConvTestConfig<3u> conv; | ||
| miopenDataType_t data_type; | ||
| miopenTensorLayout_t layout; | ||
| std::string arch; | ||
| }; | ||
|
|
||
| std::vector<PerfConfigTestCase> GetPerfConfigTestCases(miopenDataType_t data_type, std::string arch) | ||
| { | ||
| return {{{1, 128, 64, 32, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, | ||
| data_type, | ||
| miopenTensorNCDHW, | ||
| arch}, | ||
| {{1, 128, 64, 192, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, | ||
| data_type, | ||
| miopenTensorNCDHW, | ||
| arch}}; | ||
| } | ||
|
|
||
| template <miopenDataType_t date_type> | ||
| class PerfConfig_HipImplicitGemm3DGroupFwdXdlops | ||
| : public ::testing::TestWithParam<PerfConfigTestCase> | ||
| { | ||
| protected: | ||
| void TestConfigs() | ||
| { | ||
| auto test_case = GetParam(); | ||
|
|
||
| auto&& handle = get_handle(); | ||
| miopen::ExecutionContext ctx(&handle); | ||
| if(test_case.arch != ctx.GetStream().GetDeviceName()) | ||
| GTEST_SKIP(); | ||
|
|
||
| auto input_tensor_desc = | ||
| miopen::TensorDescriptor(test_case.data_type, test_case.conv.GetInput()); | ||
|
|
||
| auto weights_tensor_desc = miopen::TensorDescriptor( | ||
| test_case.data_type, test_case.layout, test_case.conv.GetWeights()); | ||
|
|
||
| auto conv_desc = test_case.conv.GetConv(); | ||
|
|
||
| auto output_desc = conv_desc.GetForwardOutputTensor( | ||
| input_tensor_desc, weights_tensor_desc, test_case.data_type); | ||
|
|
||
| auto problem = miopen::conv::ProblemDescription(input_tensor_desc, | ||
| weights_tensor_desc, | ||
| output_desc, | ||
| conv_desc, | ||
| miopen::conv::Direction::Forward); | ||
|
|
||
| Config cfg; | ||
| cfg.HeuristicInit(problem); | ||
| EXPECT_TRUE(cfg.index != 0) << "index is 0:" << test_case.conv; | ||
| } | ||
| }; | ||
|
|
||
| using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16 = | ||
| PerfConfig_HipImplicitGemm3DGroupFwdXdlops<miopenBFloat16>; | ||
| using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16 = | ||
| PerfConfig_HipImplicitGemm3DGroupFwdXdlops<miopenHalf>; | ||
|
|
||
| TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16, All) { TestConfigs(); } | ||
| TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16, All) { TestConfigs(); } | ||
|
|
||
| INSTANTIATE_TEST_SUITE_P(Full, | ||
| GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16, | ||
| testing::ValuesIn(GetPerfConfigTestCases(miopenBFloat16, "gfx942"))); | ||
|
|
||
| INSTANTIATE_TEST_SUITE_P(Full, | ||
| GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16, | ||
| testing::ValuesIn(GetPerfConfigTestCases(miopenHalf, "gfx942"))); |
Uh oh!
There was an error while loading. Please reload this page.