Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions projects/miopen/src/include/miopen/conv/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4556,11 +4556,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final
GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override;
/// \ref igemm_get_wti_magic_number
float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override
{
return 0.02f;
};
float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;

MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize(
const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#endif
#include <miopen/solver/implicitgemm_ck_util.hpp>
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)
MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE);

namespace miopen {
namespace solver {
Expand Down Expand Up @@ -360,7 +361,69 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init(const ProblemDescrip
FillValidKernelsIDs<DeviceOpGFwdDefaultPtrs<DataType>, CKArgs<DataType>>(problem);
break;
}
index = 0;
index = 0;

auto find_kernel = [&valid_kernels = std::as_const(valid_kernels)](
const std::size_t& index, const std::string& kernel_id) -> std::size_t {
// Check if valid_kernels[index] equals kernel_id.
if(index < valid_kernels.size() && valid_kernels[index] == kernel_id)
return index;

// Linear search for kernel_id in valid_kernels.
auto it = std::find(valid_kernels.begin(), valid_kernels.end(), kernel_id);
if(it != valid_kernels.end())
return static_cast<std::size_t>(it - valid_kernels.begin());

// Not found: return 0
MIOPEN_LOG_E("Not found :" << index << "-" << kernel_id);
return 0;
};

// for BF16 and FP16
index = env::value(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_IDX_OVERRIDE);
if(index == 0 && problem.GetInChannels() > 8 && problem.GetGroupCount() == 1 &&
problem.GetAlphaBetaCase() == DEFAULT)
{
int K = problem.GetOutChannels();
if(problem.GetInDataType() == miopenBFloat16)
{
if(K < 64)
{
index =
find_kernel(38,
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
"<256, 64, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, "
"BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>");
}
else
{
index =
find_kernel(30,
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
"<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, "
"BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>");
}
}
else if(problem.GetInDataType() == miopenHalf)
{
if(K < 64)
{
index =
find_kernel(57,
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
"<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, "
"BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1>");
}
else
{
index =
find_kernel(31,
"DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"
"<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, "
"BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3>");
}
}
}
kernel_id = valid_kernels[index];
}

Expand Down Expand Up @@ -425,6 +488,11 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::SetNextValue(
{
HeuristicInit(problem);
assert(!valid_kernels.empty());
if(index != 0)
{
index = 0;
kernel_id = valid_kernels[index];
}
return true;
}
if((index + 1) < valid_kernels.size())
Expand Down Expand Up @@ -540,6 +608,34 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
return false;
}

float ConvHipImplicitGemm3DGroupFwdXdlops::GetWti(
const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const
{
decltype(auto) xDesc = problem.GetIn();
decltype(auto) wDesc = problem.GetWeights();

if(xDesc.GetType() == miopenHalf || xDesc.GetType() == miopenBFloat16)
{
std::size_t in_n, in_c, w_x, w_y, w_d;
std::tie(in_n, in_c) = tie_pick<0, 1>()(xDesc.GetLengths());
std::tie(w_x, w_y, w_d) = tie_pick<2, 3, 4>()(wDesc.GetLengths());
// For cases where the filter shape is not 1x1x1 and the input channel (in_c) is greater
// than 8, CK's implementation offers better performance.
if((w_x == 1 && w_y == 1 && w_d == 1) == false)
{
if(in_c < 8 && in_n < 4)
{
return 0.00002; // force disable
}
else
{
return 1.0; // force enable
}
}
}
return 0.02f;
}

ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
[[maybe_unused]] const ExecutionContext& ctx,
[[maybe_unused]] const ProblemDescription& problem,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <gtest/gtest.h>
#include <gtest/group_conv.hpp>

#include <miopen/tensor.hpp>
#include <miopen/conv/problem_description.hpp>
#include <miopen/conv/solvers.hpp>
#include <sstream>

using Problem = miopen::conv::ProblemDescription;
using Config = miopen::solver::conv::PerformanceConfigHipImplicitGemm3DGroupFwdXdlops;

struct PerfConfigTestCase
{
struct group_conv::GroupConvTestConfig<3u> conv;
miopenDataType_t data_type;
miopenTensorLayout_t layout;
std::string arch;
};

std::vector<PerfConfigTestCase> GetPerfConfigTestCases(miopenDataType_t data_type, std::string arch)
{
return {{{1, 128, 64, 32, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}},
data_type,
miopenTensorNCDHW,
arch},
{{1, 128, 64, 192, {3, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}},
data_type,
miopenTensorNCDHW,
arch}};
}

template <miopenDataType_t date_type>
class PerfConfig_HipImplicitGemm3DGroupFwdXdlops
: public ::testing::TestWithParam<PerfConfigTestCase>
{
protected:
void TestConfigs()
{
auto test_case = GetParam();

auto&& handle = get_handle();
miopen::ExecutionContext ctx(&handle);
if(test_case.arch != ctx.GetStream().GetDeviceName())
GTEST_SKIP();

auto input_tensor_desc =
miopen::TensorDescriptor(test_case.data_type, test_case.conv.GetInput());

auto weights_tensor_desc = miopen::TensorDescriptor(
test_case.data_type, test_case.layout, test_case.conv.GetWeights());

auto conv_desc = test_case.conv.GetConv();

auto output_desc = conv_desc.GetForwardOutputTensor(
input_tensor_desc, weights_tensor_desc, test_case.data_type);

auto problem = miopen::conv::ProblemDescription(input_tensor_desc,
weights_tensor_desc,
output_desc,
conv_desc,
miopen::conv::Direction::Forward);

Config cfg;
cfg.HeuristicInit(problem);
EXPECT_TRUE(cfg.index != 0) << "index is 0:" << test_case.conv;
}
};

using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16 =
PerfConfig_HipImplicitGemm3DGroupFwdXdlops<miopenBFloat16>;
using GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16 =
PerfConfig_HipImplicitGemm3DGroupFwdXdlops<miopenHalf>;

TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16, All) { TestConfigs(); }
TEST_P(GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16, All) { TestConfigs(); }

INSTANTIATE_TEST_SUITE_P(Full,
GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_BFP16,
testing::ValuesIn(GetPerfConfigTestCases(miopenBFloat16, "gfx942")));

INSTANTIATE_TEST_SUITE_P(Full,
GPU_PerfConfig_HipImplicitGemm3DGroupFwdXdlops_FP16,
testing::ValuesIn(GetPerfConfigTestCases(miopenHalf, "gfx942")));