diff --git a/projects/miopen/Jenkinsfile b/projects/miopen/Jenkinsfile index b82468308a2e..2e9d2e0a35f5 100644 --- a/projects/miopen/Jenkinsfile +++ b/projects/miopen/Jenkinsfile @@ -153,6 +153,10 @@ pipeline { name: "DATATYPE_FP32", defaultValue: true, description: "") + booleanParam( + name: "DATATYPE_TF32", + defaultValue: true, + description: "") booleanParam( name: "DATATYPE_FP16", defaultValue: true, @@ -196,6 +200,7 @@ pipeline { Bf16_flags = " -DMIOPEN_TEST_BFLOAT16=On" Int8_flags = " -DMIOPEN_TEST_INT8=On" Full_test = " -DMIOPEN_TEST_ALL=On" + Tf32_flags = " -DMIOPEN_TEST_TF32=On" gfx908_flags = " -DMIOPEN_INSTALL_GPU_DATABASES=gfx908" gfx90a_flags = " -DMIOPEN_INSTALL_GPU_DATABASES=gfx90a" @@ -494,6 +499,24 @@ pipeline { } } } + stage('TF32 Hip Install All gfx942') { + when { + beforeAgent true + expression { params.TARGET_GFX942 && params.DATATYPE_TF32 } + } + options { + retry(2) + } + agent{ label rocmnode("gfx942") } + steps{ + runBuildAndSingleGtestJob(Full_test + Tf32_flags + gfx942_flags, Build_timeout_minutes) + } + post { + always { + cleanWs() + } + } + } // GFX1101 Tests stage('Fp16 Hip Install All gfx1101') { when { diff --git a/projects/miopen/driver/conv_driver.hpp b/projects/miopen/driver/conv_driver.hpp index 619bb94b4922..a26adea2c50f 100644 --- a/projects/miopen/driver/conv_driver.hpp +++ b/projects/miopen/driver/conv_driver.hpp @@ -452,6 +452,13 @@ class ConvDriver : public Driver constexpr bool is_bfp8 = std::is_same::value; if(is_bfp8 || is_fp8 || TensorsCasted()) tolerance *= 37.0; + + { // tf32 has same mantissa length as fp16 + auto math_type_ = inflags.GetValueInt("math_type"); + if(std::is_same_v && + (miopen::EnvEnableTF32() || (math_type_ == miopenMathDefault))) + tolerance = 8.2e-3; + } return tolerance; } @@ -864,6 +871,8 @@ int ConvDriver::GetandSetData() warmupConvDesc, static_cast(miopenConvolutionFindModeNormal)); // Repeat via hidden API. miopenSetConvolutionGroupCount(warmupConvDesc, group_count); + miopenSetConvolutionAttribute( + warmupConvDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, inflags.GetValueInt("math_type")); int warmup_out_len_size = miopen::deref(warmupInputTensor).GetNumDims(); std::vector warmup_out_len(warmup_out_len_size); @@ -1018,6 +1027,8 @@ int ConvDriver::AddCmdLineArgs() "0", "MIOpen tuning policy (Default=0, or no tuning policy set)", "int"); + // TODO:(LYM) change back to 0 when TF32 is fully supported + inflags.AddInputFlag("math_type", 'M', "1", "math type of compute (Default=1)", "int"); return 0; } @@ -1222,6 +1233,14 @@ int ConvDriver::SetConvDescriptorFromCmdLineArgs() miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data()); } + auto math_type_ = inflags.GetValueInt("math_type"); + if(math_type_ < miopenMathDefault || math_type_ > miopenMathPedantic) + { + std::cout << "Invalid math_type value: " << math_type_ << std::endl; + exit(0); // NOLINT (concurrency-mt-unsafe) + } + miopenSetConvolutionAttribute(convDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, math_type_); + return miopenStatusSuccess; } diff --git a/projects/miopen/include/miopen/miopen.h b/projects/miopen/include/miopen/miopen.h index f54dff0aa4c9..e0bc042524f8 100644 --- a/projects/miopen/include/miopen/miopen.h +++ b/projects/miopen/include/miopen/miopen.h @@ -120,6 +120,14 @@ typedef enum miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. */ } miopenStatus_t; +typedef enum +{ + // TODO:(LYM) temporary use Pedantic as default until TF32 is fully supported + miopenMathDefault = 0, /*!< Use TF32 if possible */ + miopenMathPedantic = + 1, /*!< Default MathType. Strict IEEE compliance. Don't allow datatype down conversion. */ +} miopenMathType_t; + #ifdef MIOPEN_BETA_API typedef enum { @@ -639,6 +647,9 @@ typedef enum #else // miopenReserved1 = 2, #endif + // TODO:(LYM) temporarily use Pedantic as default until TF32 is fully supported + MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE = + 3, /*!< refer to miopenMathType_t,default is miopenMathPedantic >*/ } miopenConvolutionAttrib_t; /*! @ingroup convolutions diff --git a/projects/miopen/src/CMakeLists.txt b/projects/miopen/src/CMakeLists.txt index 60fc6ad62aba..7c8fc4697d7c 100644 --- a/projects/miopen/src/CMakeLists.txt +++ b/projects/miopen/src/CMakeLists.txt @@ -926,6 +926,9 @@ if(MIOPEN_USE_COMPOSABLEKERNEL) # Use the aliased targets when we pull CK from /opt/rocm or other place on disc. set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_conv_operations hip::host) endif() + if(GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950") + target_compile_definitions(MIOpen PRIVATE CK_ENABLE_TF32) + endif() endif() if(WIN32) diff --git a/projects/miopen/src/conv/problem_description.cpp b/projects/miopen/src/conv/problem_description.cpp index b5492fa7a9a9..c94f2a7ab94a 100644 --- a/projects/miopen/src/conv/problem_description.cpp +++ b/projects/miopen/src/conv/problem_description.cpp @@ -215,9 +215,14 @@ void ProblemDescription::MakeNetworkConfig(std::string& conf_key) const ss << 'x' << GetWeightsLayout(); ss << 'x' << GetOutLayout(); } - ss << 'x' << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType()); + const auto data_type = + EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType()); + ss << 'x' << data_type; std::ostringstream optional; + if(data_type == "FP32" && UseTF32()) + optional << "TF32" << 'x'; + if(const auto ct = GetInCastType()) optional << "ci" << GetDataTypeName(*ct); if(const auto ct = GetWeightsCastType()) @@ -271,10 +276,12 @@ void ProblemDescription::Serialize(std::ostream& stream) const stream << sep << GetWeightsLayout(); stream << sep << GetOutLayout(); } - stream << sep << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType()); + // clang-format on + const auto data_type = + EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType()); + stream << sep << data_type; stream << sep << GetDirectionStr(); - // clang-format on // New performance config entries shall come into variable/optional part of db key. // This is to support backward compatibility with previous versions of databases. std::ostringstream optional; @@ -290,6 +297,10 @@ void ProblemDescription::Serialize(std::ostream& stream) const if(const auto ct = GetOutCastType()) optional << "_co" << GetDataTypeName(*ct); + // cx indicates compute datatype + if(data_type == "FP32" && UseTF32()) + optional << "_cxTF32"; + SerializeStrides(optional, in, out, weights, sep); } if(!optional.str().empty()) @@ -350,6 +361,14 @@ void ProblemDescription::SetupFloats(ExecutionContext& ctx) const << "x" << GetDataTypeName(GetOutDataType())); } +void ProblemDescription::SetupComputeType(const ExecutionContext& ctx) const +{ + if(miopen::IsTF32Supported(ctx.GetStream().GetDeviceName()) && conv.EnableTF32()) + { + use_tf32 = true; + } +} + std::string ProblemDescription::ComputeLayout(const TensorDescriptor& td) const { return td.GetLayout_str(); diff --git a/projects/miopen/src/conv/solver_finders.cpp b/projects/miopen/src/conv/solver_finders.cpp index 2a827bc239a1..1789656388c8 100644 --- a/projects/miopen/src/conv/solver_finders.cpp +++ b/projects/miopen/src/conv/solver_finders.cpp @@ -330,7 +330,8 @@ std::vector EvaluateInvokers(const Handle& handle, MIOPEN_THROW_IF(elapsed <= 0, "Invalid elapsed time detected in EvaluateInvokers"); - MIOPEN_LOG_I(sol << ": " << elapsed << (elapsed < best ? " < " : " >= ") << best); + MIOPEN_LOG_I("solution(current vs best):" << sol << ": " << elapsed + << (elapsed < best ? " < " : " >= ") << best); if(elapsed < best) { best = elapsed; diff --git a/projects/miopen/src/convolution.cpp b/projects/miopen/src/convolution.cpp index 5bd6c1756865..8e43e0e1bb20 100644 --- a/projects/miopen/src/convolution.cpp +++ b/projects/miopen/src/convolution.cpp @@ -468,6 +468,16 @@ std::size_t ConvolutionDescriptor::GetWorkSpaceSize(ExecutionContext ctx, return workspace_size; } +bool ConvolutionDescriptor::EnableTF32() const +{ + // TODO:(LYM) change back to && when TF32 is fully supported + if((miopen::EnvEnableTF32() || + (static_cast(attribute.Get(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)) == + miopenMathDefault))) + return true; + return false; +} + std::ostream& operator<<(std::ostream& stream, const ConvolutionDescriptor& c) { stream << "conv" << c.spatialDim << "d, "; @@ -540,6 +550,18 @@ void ConvolutionAttribute::Set(miopenConvolutionAttrib_t attr, int value) } fp8rounding_mode.rounding_mode = rounding_mode; } + else if(attr == MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE) + { + const auto math_type_ = static_cast(value); + if(math_type_ != miopenMathDefault && math_type_ != miopenMathPedantic) + { + MIOPEN_THROW(miopenStatusBadParm, + "[Set conv attribute] Error: Attempt to set invalid value for " + "MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE: " + + std::to_string(value)); + } + math_type.value = math_type_; + } else { MIOPEN_THROW(miopenStatusBadParm, @@ -556,6 +578,8 @@ int ConvolutionAttribute::Get(miopenConvolutionAttrib_t attr) const return static_cast(fp8rounding_mode.rounding_mode); else if(attr == MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC) return deterministic.value; + else if(attr == MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE) + return math_type.value; MIOPEN_THROW(miopenStatusBadParm, "[Get conv attribute] Error: Attribute [" + std::to_string(static_cast(attr)) + "] does not exist."); diff --git a/projects/miopen/src/convolution_api.cpp b/projects/miopen/src/convolution_api.cpp index 1bcbe53b162c..d84774bc2772 100644 --- a/projects/miopen/src/convolution_api.cpp +++ b/projects/miopen/src/convolution_api.cpp @@ -67,6 +67,7 @@ static inline auto MakeFwdCtxAndProblem(miopenHandle_t handle, auto ctx = ExecutionContext{&miopen::deref(handle)}; problem.SetupFloats(ctx); + problem.SetupComputeType(ctx); return std::make_tuple(std::move(ctx), std::move(problem)); } @@ -86,6 +87,7 @@ static inline auto MakeBwdCtxAndProblem(miopenHandle_t handle, auto ctx = ExecutionContext{&miopen::deref(handle)}; problem.SetupFloats(ctx); + problem.SetupComputeType(ctx); return std::make_tuple(std::move(ctx), std::move(problem)); } @@ -111,6 +113,7 @@ static inline auto MakeWrWCtxAndProblem(miopenHandle_t handle, auto ctx = ExecutionContext{&miopen::deref(handle)}; problem.SetupFloats(ctx); + problem.SetupComputeType(ctx); return std::make_tuple(std::move(ctx), std::move(problem)); } diff --git a/projects/miopen/src/include/miopen/conv/problem_description.hpp b/projects/miopen/src/include/miopen/conv/problem_description.hpp index cdcf3f0ca302..120c1117a194 100644 --- a/projects/miopen/src/include/miopen/conv/problem_description.hpp +++ b/projects/miopen/src/include/miopen/conv/problem_description.hpp @@ -295,6 +295,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase return GetInCastType() || GetWeightsCastType() || GetOutCastType(); } + bool UseTF32() const { return use_tf32; } + // To be used in Solvers that do not implement ALT FP16 kernels. // Those Solvers must be non-applicable for gfx90a when this function returns true. bool IsGfx90aFp16altRequired() const @@ -395,6 +397,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase self.GetInDataType(), self.GetWeightsDataType(), self.GetOutDataType()); f(data_type, "data_type"); f(self.GetDirectionStr(), "direction"); + if(data_type == "FP32" && self.UseTF32()) + f("TF32", "compute_datatype"); } template @@ -406,6 +410,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase } void SetupFloats(ExecutionContext& ctx) const; + void SetupComputeType(const ExecutionContext& ctx) const; private: std::string ComputeLayout(const TensorDescriptor& td) const; @@ -425,6 +430,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase Scalar alpha = Scalar(1.0); Scalar beta = Scalar(0.0); miopenAlphaBetaCase_t alpha_beta_case = DEFAULT; + mutable bool use_tf32 = false; }; } // namespace conv diff --git a/projects/miopen/src/include/miopen/conv/solvers.hpp b/projects/miopen/src/include/miopen/conv/solvers.hpp index befb51c91f31..d880ea180b8f 100644 --- a/projects/miopen/src/include/miopen/conv/solvers.hpp +++ b/projects/miopen/src/include/miopen/conv/solvers.hpp @@ -4436,6 +4436,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops MIOPEN_INTERNALS_EXPORT bool IsModelApplicable(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem) const; + bool UseTF32() const { return use_tf32; } private: #if MIOPEN_ENABLE_AI_KERNEL_TUNING @@ -4451,6 +4452,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops void Init(const miopen::conv::ProblemDescription&); template bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; + mutable bool use_tf32 = false; }; struct ConvHipImplicitGemmGroupFwdXdlops final @@ -4524,13 +4526,15 @@ struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const; + bool UseTF32() const { return use_tf32; } private: - template - void Init(const miopen::conv::ProblemDescription&); - template + template + bool Init(const miopen::conv::ProblemDescription&); + template bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; void InitValidKernels(const miopen::conv::ProblemDescription& problem); + mutable bool use_tf32 = false; }; struct ConvHipImplicitGemm3DGroupFwdXdlops final @@ -4567,7 +4571,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final bool MayNeedWorkspace() const override { return true; } private: - template + template bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; }; @@ -4601,13 +4605,15 @@ struct PerformanceConfigHipImplicitGemm3DGroupWrwXdlops MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& other) const; + bool UseTF32() const { return use_tf32; } private: - template - void Init(const miopen::conv::ProblemDescription&); - template + template + bool Init(const miopen::conv::ProblemDescription&); + template bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; void InitValidKernels(const miopen::conv::ProblemDescription& problem); + mutable bool use_tf32 = false; }; struct ConvHipImplicitGemm3DGroupWrwXdlops final @@ -4647,10 +4653,10 @@ struct ConvHipImplicitGemm3DGroupWrwXdlops final bool MayNeedWorkspace() const override { return true; } private: - template + template bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; - template + template std::size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription&) const; size_t GetCKMaxWorkspaceSize(const miopen::conv::ProblemDescription& problem) const; }; @@ -4685,13 +4691,15 @@ struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& other) const; + bool UseTF32() const { return use_tf32; } private: - template - void Init(const miopen::conv::ProblemDescription&); - template + template + bool Init(const miopen::conv::ProblemDescription&); + template bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; void InitValidKernels(const miopen::conv::ProblemDescription& problem); + mutable bool use_tf32 = false; }; struct ConvHipImplicitGemm3DGroupBwdXdlops final @@ -4731,7 +4739,7 @@ struct ConvHipImplicitGemm3DGroupBwdXdlops final bool MayNeedWorkspace() const override { return true; } private: - template + template bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; }; @@ -4769,6 +4777,7 @@ struct PerformanceConfigHipImplicitGemmGroupBwdXdlops MIOPEN_INTERNALS_EXPORT bool IsModelApplicable(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem) const; + bool UseTF32() const { return use_tf32; } private: #if MIOPEN_ENABLE_AI_KERNEL_TUNING @@ -4787,6 +4796,7 @@ struct PerformanceConfigHipImplicitGemmGroupBwdXdlops void Init(const miopen::conv::ProblemDescription&); template bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; + mutable bool use_tf32 = false; }; struct ConvHipImplicitGemmGroupBwdXdlops final @@ -4865,6 +4875,7 @@ struct PerformanceConfigHipImplicitGemmGroupWrwXdlops MIOPEN_INTERNALS_EXPORT bool IsModelApplicable(const ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem) const; + bool UseTF32() const { return use_tf32; } private: #if MIOPEN_ENABLE_AI_KERNEL_TUNING @@ -4883,6 +4894,7 @@ struct PerformanceConfigHipImplicitGemmGroupWrwXdlops void Init(const miopen::conv::ProblemDescription&); template bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; + mutable bool use_tf32 = false; }; struct ConvHipImplicitGemmGroupWrwXdlops final diff --git a/projects/miopen/src/include/miopen/convolution.hpp b/projects/miopen/src/include/miopen/convolution.hpp index 86a574cb32f7..eb16142ef1cb 100644 --- a/projects/miopen/src/include/miopen/convolution.hpp +++ b/projects/miopen/src/include/miopen/convolution.hpp @@ -50,8 +50,15 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC) MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE) MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED) +// disable TF32 by default temporarily until we fully complete this feature. +// TODO:(LYM) change back +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TF32_OVERRIDE, 0); +MIOPEN_DECLARE_ENV_VAR_BOOL(NVIDIA_TF32_OVERRIDE, 0); + namespace miopen { +MIOPEN_INTERNALS_EXPORT bool EnvEnableTF32(); + namespace conv { struct ProblemDescription; } // namespace conv @@ -129,6 +136,17 @@ struct MIOPEN_INTERNALS_EXPORT ConvolutionAttribute } } deterministic; + class MathType + { + // temporary set default to pedantic until we fully complete this feature. + // TODO:(LYM) change back + miopenMathType_t value = miopenMathPedantic; + friend struct ConvolutionAttribute; + + public: + inline int Get() const { return value; } + } math_type; + /// Tri-state attribute values: /// * -1: Default (attribute-specific). /// * 0: Disabled/Yes. @@ -347,6 +365,7 @@ struct MIOPEN_INTERNALS_EXPORT ConvolutionDescriptor : miopenConvolutionDescript Data_t dw, Data_t workSpace, std::size_t workSpaceSize) const; + miopenMathType_t GetMathType() const; std::size_t spatialDim; miopenConvolutionMode_t mode; @@ -370,6 +389,8 @@ struct MIOPEN_INTERNALS_EXPORT ConvolutionDescriptor : miopenConvolutionDescript std::size_t GetSolutionCountFallback(const ExecutionContext& ctx, const conv::ProblemDescription& problem) const; + bool EnableTF32() const; + friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv); friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv); diff --git a/projects/miopen/src/include/miopen/fusion/context.hpp b/projects/miopen/src/include/miopen/fusion/context.hpp index 435097a68a9a..9cc91f519340 100644 --- a/projects/miopen/src/include/miopen/fusion/context.hpp +++ b/projects/miopen/src/include/miopen/fusion/context.hpp @@ -42,6 +42,7 @@ struct FusionContext : ExecutionContext { auto ctx = ExecutionContext{*this}; conv_problem.SetupFloats(ctx); + conv_problem.SetupComputeType(ctx); return ctx; } }; diff --git a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp index 00acf5cae76e..e3909faf17be 100644 --- a/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/projects/miopen/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -76,7 +76,7 @@ inline static bool NextCKSplitkValue(int& v) #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL namespace conv { -template +template using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight< 2, ck::tensor_layout::convolution::NHWGC, @@ -87,12 +87,13 @@ using DeviceOpGWrw = ck::tensor_operation::device::DeviceGroupedConvBwdWeight< DataType, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough>; -template -using DeviceOpGWrwPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory>; + ck::tensor_operation::element_wise::PassThrough, + ComputeType>; +template +using DeviceOpGWrwPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGWrw>; -template +template using DeviceOpGBwd = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD< 2, ck::tensor_layout::convolution::NHWGK, @@ -105,11 +106,12 @@ using DeviceOpGBwd = ck::tensor_operation::device::DeviceGroupedConvBwdDataMulti DataType, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough>; + ck::tensor_operation::element_wise::PassThrough, + ComputeType>; -template -using DeviceOpGBwdPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory>; +template +using DeviceOpGBwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGBwd>; using InLayout = ck::tensor_layout::convolution::NDHWGC; using WeiLayout = ck::tensor_layout::convolution::GKZYXC; @@ -118,7 +120,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using Bilinear = ck::tensor_operation::element_wise::Bilinear; using Scale = ck::tensor_operation::element_wise::Scale; -template +template using DeviceOpGBwdWeightDefault = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<3, InLayout, @@ -129,9 +131,10 @@ using DeviceOpGBwdWeightDefault = DataType, PassThrough, PassThrough, - PassThrough>; + PassThrough, + ComputeType>; -template +template using DeviceOpGBwdWeightBilinear = ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD<3, InLayout, @@ -144,9 +147,10 @@ using DeviceOpGBwdWeightBilinear = ck::Tuple, PassThrough, Bilinear, - PassThrough>; + PassThrough, + ComputeType>; -template +template using DeviceOpGBwdWeightScale = ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD<3, InLayout, @@ -159,22 +163,23 @@ using DeviceOpGBwdWeightScale = ck::Tuple<>, PassThrough, Scale, - PassThrough>; + PassThrough, + ComputeType>; -template +template using DeviceOpGBwdWeightDefaultPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGBwdWeightDefault>; + DeviceOpGBwdWeightDefault>; -template +template using DeviceOpGBwdWeightBilinearPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGBwdWeightBilinear>; + DeviceOpGBwdWeightBilinear>; -template +template using DeviceOpGBwdWeightScalePtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGBwdWeightScale>; + DeviceOpGBwdWeightScale>; } // namespace conv @@ -246,22 +251,27 @@ inline constexpr bool IsSplitKNeeded() { return std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v> || std::is_same_v>; } @@ -1469,7 +1479,8 @@ template ConvSolution MakeSolutionGroupConvImplicitGemmXdlops(const miopen::conv::ProblemDescription& problem, InvokerFactoryMakerNCHW&& invoker_factory_maker_ncdhw, - InvokerFactoryMakerNHWC&& invoker_factory_maker_ndhwc) + InvokerFactoryMakerNHWC&& invoker_factory_maker_ndhwc, + const bool use_tf32 = false) { #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL @@ -1477,10 +1488,14 @@ MakeSolutionGroupConvImplicitGemmXdlops(const miopen::conv::ProblemDescription& { switch(problem.GetInDataType()) { - case miopenInt8: return invoker_factory_maker_ncdhw(int8_t{}); - case miopenHalf: return invoker_factory_maker_ncdhw(ck::half_t{}); - case miopenFloat: return invoker_factory_maker_ncdhw(float{}); - case miopenBFloat16: return invoker_factory_maker_ncdhw(ck::bhalf_t{}); + case miopenInt8: return invoker_factory_maker_ncdhw(int8_t{}, int8_t{}); + case miopenHalf: return invoker_factory_maker_ncdhw(ck::half_t{}, ck::half_t{}); + case miopenFloat: + if(use_tf32) + return invoker_factory_maker_ncdhw(float{}, ck::tf32_t{}); + else + return invoker_factory_maker_ncdhw(float{}, float{}); + case miopenBFloat16: return invoker_factory_maker_ncdhw(ck::bhalf_t{}, ck::bhalf_t{}); case miopenInt64: case miopenInt32: case miopenDouble: @@ -1496,10 +1511,14 @@ MakeSolutionGroupConvImplicitGemmXdlops(const miopen::conv::ProblemDescription& { switch(problem.GetInDataType()) { - case miopenInt8: return invoker_factory_maker_ndhwc(int8_t{}); - case miopenHalf: return invoker_factory_maker_ndhwc(ck::half_t{}); - case miopenFloat: return invoker_factory_maker_ndhwc(float{}); - case miopenBFloat16: return invoker_factory_maker_ndhwc(ck::bhalf_t{}); + case miopenInt8: return invoker_factory_maker_ndhwc(int8_t{}, int8_t{}); + case miopenHalf: return invoker_factory_maker_ndhwc(ck::half_t{}, ck::half_t{}); + case miopenFloat: + if(use_tf32) + return invoker_factory_maker_ndhwc(float{}, ck::tf32_t{}); + else + return invoker_factory_maker_ndhwc(float{}, float{}); + case miopenBFloat16: return invoker_factory_maker_ndhwc(ck::bhalf_t{}, ck::bhalf_t{}); case miopenInt64: case miopenInt32: case miopenDouble: diff --git a/projects/miopen/src/include/miopen/target_properties.hpp b/projects/miopen/src/include/miopen/target_properties.hpp index 83f7e1ff3851..cab04861a8e3 100644 --- a/projects/miopen/src/include/miopen/target_properties.hpp +++ b/projects/miopen/src/include/miopen/target_properties.hpp @@ -28,12 +28,19 @@ #include #include +#include +#include #define WORKAROUND_ISSUE_1204 1 // ROCm may incorrectly report "sramecc-" for gfx900. #define WORKAROUND_ISSUE_3001 1 namespace miopen { +static inline bool IsTF32Supported(const std::string& device_name) +{ + return device_name == "gfx942" || StartsWith(device_name, "gfx95"); +} + struct Handle; class TargetProperties diff --git a/projects/miopen/src/kernels/gpu_reference_kernel/naive_conv.cpp b/projects/miopen/src/kernels/gpu_reference_kernel/naive_conv.cpp index ff09bbff4a19..fc2d54389f7e 100644 --- a/projects/miopen/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/projects/miopen/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -71,7 +71,7 @@ inline __device__ __host__ ushort convert_fp32_to_bf16(float src_val) return target_val.ushortvec[1]; } -template +template inline __device__ __host__ dst_data_t cast_to(const src_data_t& val) { return static_cast(val); @@ -102,6 +102,25 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } +template <> +inline __device__ __host__ float cast_to(const float& val) +{ + union + { + float fp32; + uint32_t int32; + } u = {val}; + + u.int32 = u.int32 & 0xffffe000; + return u.fp32; +} + +template <> +inline __device__ __host__ double cast_to(const float& val) +{ + return static_cast(cast_to(val)); +} + inline __device__ __host__ bool IsZero(double val) { return val == 0.0; } inline __device__ __host__ bool IsOne(double val) { return val == 1.0; } @@ -141,7 +160,11 @@ inline __device__ void applyalphaBetaUpdate(dst_data_t* __restrict__ p_array, /// alpha and beta are double to ensure high precision. -template +template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -237,8 +260,8 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -250,8 +273,8 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, static_cast(iy) * wei_strides[1] + static_cast(ix) * wei_strides[0]; - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -271,7 +294,11 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -372,8 +399,8 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, size_t f_idx = static_cast(ik) * c_per_group * fy * fx + static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -385,8 +412,8 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, static_cast(iy) * wei_strides[1] + static_cast(ix) * wei_strides[0]; - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -407,7 +434,11 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const double alpha, @@ -503,8 +534,8 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, size_t o_idx = static_cast(in) * k * ho * wo + static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } else { @@ -517,8 +548,8 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, static_cast(iho) * out_strides[1] + static_cast(iwo) * out_strides[0]; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } } } @@ -542,7 +573,12 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, } // design block_size 256 -template + +template inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -656,8 +692,8 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -671,8 +707,8 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, static_cast(iy) * wei_strides[1] + static_cast(ix) * wei_strides[0]; - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -696,7 +732,11 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -819,8 +859,8 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, static_cast(iz) * fy * fx + static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -834,8 +874,8 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, static_cast(iy) * wei_strides[1] + static_cast(ix) * wei_strides[0]; - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -859,7 +899,11 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const double alpha, @@ -971,8 +1015,8 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } else { @@ -987,8 +1031,8 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, static_cast(iho) * out_strides[1] + static_cast(iwo) * out_strides[0]; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } } } @@ -1016,7 +1060,12 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, /***************************** nhwc *****************************/ // design block_size 256 -template + +template inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -1114,8 +1163,8 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, static_cast(iy) * fx * c_per_group + static_cast(ix) * c_per_group + static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -1130,8 +1179,8 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, static_cast(ix) * wei_strides[1] + static_cast(ic) * wei_strides[0]; - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -1154,7 +1203,11 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -1256,8 +1309,8 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, static_cast(iy) * fx * c_per_group + static_cast(ix) * c_per_group + static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -1272,8 +1325,8 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, static_cast(ix) * wei_strides[1] + static_cast(ic) * wei_strides[0]; - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -1296,7 +1349,11 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const double alpha, @@ -1392,8 +1449,8 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, static_cast(iho) * wo * k + static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } else { @@ -1406,8 +1463,8 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, static_cast(iho) * out_strides[3] + static_cast(iwo) * out_strides[2]; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } } } @@ -1431,7 +1488,12 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, } // design block_size 256 -template + +template inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -1544,8 +1606,8 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, static_cast(iy) * fx * c_per_group + static_cast(ix) * c_per_group + static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -1560,8 +1622,8 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, static_cast(ix) * wei_strides[1] + static_cast(ic) * wei_strides[0]; - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -1585,7 +1647,11 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const double alpha, @@ -1706,8 +1772,8 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, static_cast(iy) * fx * c_per_group + static_cast(ix) * c_per_group + static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } else { @@ -1722,8 +1788,8 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, static_cast(ix) * wei_strides[1] + static_cast(ic) * wei_strides[0]; - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); } } } @@ -1747,7 +1813,11 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, } } -template +template inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const double alpha, @@ -1859,8 +1929,8 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, static_cast(iho) * wo * k + static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } else { @@ -1876,8 +1946,8 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, static_cast(iho) * out_strides[3] + static_cast(iwo) * out_strides[2]; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); } } } @@ -1903,297 +1973,323 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, } } -#define DEFINE_2D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_ab_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - double alpha, \ - double beta, \ - dst_data_t* __restrict__ p_out, \ - Strides5D in_strides, \ - Strides5D wei_strides, \ - Strides5D out_strides, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_##direction##_##tensor_layout( \ - p_in, \ - p_wei, \ - alpha, \ - beta, \ - p_out, \ - in_strides, \ - wei_strides, \ - out_strides, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } \ - extern "C" __global__ void \ - naive_conv_ab_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - double alpha, \ - double beta, \ - dst_data_t* __restrict__ p_out, \ - Strides5D in_strides, \ - Strides5D wei_strides, \ - Strides5D out_strides, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_##direction##_##tensor_layout( \ - p_in, \ - p_wei, \ - alpha, \ - beta, \ - p_out, \ - in_strides, \ - wei_strides, \ - out_strides, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ +#define DEFINE_2D_NAIVE_CONV_KERNEL( \ + direction, tensor_layout, src_data_t, acc_data_t, dst_data_t, use_tf32) \ + extern "C" __global__ void \ + naive_conv_ab_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t##_##use_tf32( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + double alpha, \ + double beta, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout(p_in, \ + p_wei, \ + alpha, \ + beta, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_ab_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t##_##use_tf32( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + double alpha, \ + double beta, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout(p_in, \ + p_wei, \ + alpha, \ + beta, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ } -#define DEFINE_3D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_ab_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - double alpha, \ - double beta, \ - dst_data_t* __restrict__ p_out, \ - Strides6D in_strides, \ - Strides6D wei_strides, \ - Strides6D out_strides, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_##direction##_##tensor_layout( \ - p_in, \ - p_wei, \ - alpha, \ - beta, \ - p_out, \ - in_strides, \ - wei_strides, \ - out_strides, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ - } \ - extern "C" __global__ void \ - naive_conv_ab_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - double alpha, \ - double beta, \ - dst_data_t* __restrict__ p_out, \ - Strides6D in_strides, \ - Strides6D wei_strides, \ - Strides6D out_strides, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_##direction##_##tensor_layout( \ - p_in, \ - p_wei, \ - alpha, \ - beta, \ - p_out, \ - in_strides, \ - wei_strides, \ - out_strides, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_3D_NAIVE_CONV_KERNEL( \ + direction, tensor_layout, src_data_t, acc_data_t, dst_data_t, use_tf32) \ + extern "C" __global__ void \ + naive_conv_ab_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t##_##use_tf32( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + double alpha, \ + double beta, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout(p_in, \ + p_wei, \ + alpha, \ + beta, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_ab_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t##_##use_tf32( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + double alpha, \ + double beta, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout(p_in, \ + p_wei, \ + alpha, \ + beta, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ } -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, half, double, half) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int8_t) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, float) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, half, double, half) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, ushort, double, ushort) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int8_t) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, float) - -DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float) -DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, half, double, half) -DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float) -DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, half, double, half) -DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, ushort, double, ushort) - -DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float) -DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, half, double, half) -DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float) -DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, half, double, half) -DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, half, double, half) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, float) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, half, double, half) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, ushort, double, ushort) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, float) - -DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float) -DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, half, double, half) -DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float) -DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, half, double, half) -DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float) -DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, half, double, half) -DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) -DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) -DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float, 1) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, half, double, half, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, ushort, double, ushort, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int8_t, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int32_t, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float, 1) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, half, double, half, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, ushort, double, ushort, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int8_t, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int32_t, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, float, 0) + +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float, 1) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, half, double, half, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, ushort, double, ushort, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float, 1) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, half, double, half, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, ushort, double, ushort, 0) + +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float, 1) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, half, double, half, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, ushort, double, ushort, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float, 1) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, half, double, half, 0) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, ushort, double, ushort, 0) + +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float, 1) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, half, double, half, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, ushort, double, ushort, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, int32_t, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float, 1) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, half, double, half, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, ushort, double, ushort, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, int32_t, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, float, 0) + +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float, 1) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, half, double, half, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, ushort, double, ushort, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float, 1) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, half, double, half, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, ushort, double, ushort, 0) + +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float, 1) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, half, double, half, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float, 1) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half, 0) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort, 0) /// \todo discuss whether we should split the kernels into separate files, or /// figure out a mechanism to compile each kernel separately to reduce hipRTC diff --git a/projects/miopen/src/ocl/convolutionocl.cpp b/projects/miopen/src/ocl/convolutionocl.cpp index 4c99bae522b7..2a5bdd0b3461 100644 --- a/projects/miopen/src/ocl/convolutionocl.cpp +++ b/projects/miopen/src/ocl/convolutionocl.cpp @@ -194,6 +194,7 @@ static Invoker PrepareInvoker(ExecutionContext ctx, solver::Id solver_id) { problem.SetupFloats(ctx); + problem.SetupComputeType(ctx); ctx.do_search = false; ctx.disable_search_enforce = true; @@ -599,6 +600,7 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(const Handle& handle, const auto ctx = [&] { auto tmp = ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); tmp.do_search = exhaustiveSearch; return tmp; }(); @@ -817,6 +819,8 @@ void ConvolutionDescriptor::ConvolutionForward(const Handle& handle, const auto problem = conv::ProblemDescription{ xDesc, wDesc, yDesc, *this, conv::Direction::Forward, 0, alpha_val, beta_val}; ValidateAlphaBeta(problem); + auto ctx = ExecutionContext{&handle}; + problem.SetupComputeType(ctx); ConvForwardCheckNumerics(handle, tensors, [&]() { Problem::ValidateGroupCount(xDesc, wDesc, *this); @@ -1061,6 +1065,7 @@ std::size_t ConvolutionDescriptor::GetForwardSolutionWorkspaceSize(const Handle& conv::ProblemDescription{xDesc, wDesc, yDesc, *this, conv::Direction::Forward}; auto ctx = ExecutionContext{}; ctx.SetStream(&handle); + problem.SetupComputeType(ctx); if(sol.IsApplicable(ctx, problem)) return sol.GetWorkspaceSize(ctx, problem); MIOPEN_THROW(miopenStatusBadParm, @@ -1098,7 +1103,8 @@ void ConvolutionDescriptor::ConvolutionForwardImmediate(const Handle& handle, ConvForwardCheckNumerics(handle, tensors, [&]() { const auto problem = conv::ProblemDescription{xDesc, wDesc, yDesc, *this, conv::Direction::Forward}; - const auto ctx = ExecutionContext{&handle}; + const auto ctx = ExecutionContext{&handle}; + problem.SetupComputeType(ctx); const auto invoker = LoadOrPrepareInvoker(ctx, problem, solver_id); const auto invoke_ctx = conv::DataInvokeParams{ tensors, workSpace, workSpaceSize, this->attribute.gfx90aFp16alt.GetFwd()}; @@ -1143,6 +1149,7 @@ void ConvolutionDescriptor::FindConvBwdDataAlgorithm(const Handle& handle, const auto ctx = [&] { auto tmp = ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); tmp.do_search = exhaustiveSearch; return tmp; }(); @@ -1222,6 +1229,8 @@ void ConvolutionDescriptor::ConvolutionBackwardData(const Handle& handle, const auto problem = conv::ProblemDescription{ dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData, 0, alpha_val, beta_val}; ValidateAlphaBeta(problem); + ExecutionContext ctx{&handle}; + problem.SetupComputeType(ctx); ConvBwdCheckNumerics(handle, tensors, beta, [&]() { if(dyDesc.GetLengths()[1] != wDesc.GetLengths()[0]) @@ -1266,6 +1275,7 @@ std::size_t ConvolutionDescriptor::GetBackwardSolutionWorkspaceSize(const Handle conv::ProblemDescription{dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData}; auto ctx = ExecutionContext{}; ctx.SetStream(&handle); + problem.SetupComputeType(ctx); if(sol.IsApplicable(ctx, problem)) { return sol.GetWorkspaceSize(ctx, problem); @@ -1305,7 +1315,8 @@ void ConvolutionDescriptor::ConvolutionBackwardImmediate(const Handle& handle, const auto problem = conv::ProblemDescription{dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData}; - const auto ctx = ExecutionContext{&handle}; + const auto ctx = ExecutionContext{&handle}; + problem.SetupComputeType(ctx); const auto invoker = LoadOrPrepareInvoker(ctx, problem, solver_id); const auto invoke_ctx = conv::DataInvokeParams{ tensors, workSpace, workSpaceSize, this->attribute.gfx90aFp16alt.GetBwd()}; @@ -1350,6 +1361,7 @@ void ConvolutionDescriptor::FindConvBwdWeightsAlgorithm(const Handle& handle, const auto ctx = [&] { auto tmp = ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); tmp.do_search = exhaustiveSearch; return tmp; }(); @@ -1428,6 +1440,8 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(const Handle& handle, decltype(auto) problem = conv::ProblemDescription{dyDesc, dwDesc, xDesc, *this, direction, 0, alpha_val, beta_val}; ValidateAlphaBeta(problem); + ExecutionContext ctx{&handle}; + problem.SetupComputeType(ctx); if(xDesc.GetType() == miopenInt8) MIOPEN_THROW(miopenStatusBadParm); @@ -1470,6 +1484,7 @@ std::size_t ConvolutionDescriptor::GetWrwSolutionWorkspaceSize(const Handle& han conv::ProblemDescription{dyDesc, dwDesc, xDesc, *this, conv::Direction::BackwardWeights}; auto ctx = ExecutionContext{}; ctx.SetStream(&handle); + problem.SetupComputeType(ctx); if(sol.IsApplicable(ctx, problem)) { return sol.GetWorkspaceSize(ctx, problem); @@ -1507,7 +1522,8 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(const Handle& handle, const auto problem = conv::ProblemDescription{ dyDesc, dwDesc, xDesc, *this, conv::Direction::BackwardWeights}; - const auto ctx = ExecutionContext{&handle}; + const auto ctx = ExecutionContext{&handle}; + problem.SetupComputeType(ctx); const auto invoker = LoadOrPrepareInvoker(ctx, problem, solver_id); const auto invoke_ctx = conv::WrWInvokeParams{ tensors, workSpace, workSpaceSize, this->attribute.gfx90aFp16alt.GetWrW()}; @@ -1515,6 +1531,11 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(const Handle& handle, }); } +miopenMathType_t ConvolutionDescriptor::GetMathType() const +{ + return static_cast(this->attribute.Get(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)); +} + void ConvolutionBackwardBias(const Handle& handle, const void* alpha, const TensorDescriptor& dyDesc, @@ -1610,4 +1631,19 @@ void ConvolutionBackwardBias(const Handle& handle, } } +bool EnvEnableTF32() +{ + // disable TF32 by default temporarily until we fully complete this feature. + // so either one is set to true, we enable TF32 + // TODO:(LYM) change back to default enabled + bool bool_miopen = miopen::env::enabled(MIOPEN_TF32_OVERRIDE); + bool bool_nvidia = miopen::env::enabled(NVIDIA_TF32_OVERRIDE); + if(bool_miopen != bool_nvidia) + MIOPEN_LOG_I2("TF32_OVERRIDE is set to different values for MIOPEN_TF32_OVERRIDE (" + << bool_miopen << ") and NVIDIA_TF32_OVERRIDE (" << bool_nvidia + << "). TF32 is currently treated as enabled (temporary; may be changed to " + "disabled in future)."); + return bool_miopen || bool_nvidia; +} + } // namespace miopen diff --git a/projects/miopen/src/problem.cpp b/projects/miopen/src/problem.cpp index a89668d9841a..f82db9d6ea8d 100644 --- a/projects/miopen/src/problem.cpp +++ b/projects/miopen/src/problem.cpp @@ -529,6 +529,7 @@ std::vector Problem::FindSolutionsImpl(const Handle& handle, auto ctx = ExecutionContext{&handle}; conv_problem.SetupFloats(ctx); + conv_problem.SetupComputeType(ctx); ctx.do_search = options.exhaustive_search; const auto invoke_ctx = diff --git a/projects/miopen/src/solution.cpp b/projects/miopen/src/solution.cpp index 7a41885874df..1d4357f2d823 100644 --- a/projects/miopen/src/solution.cpp +++ b/projects/miopen/src/solution.cpp @@ -231,6 +231,7 @@ void Solution::RunImpl(const Handle& handle, { auto ctx = ExecutionContext{&handle}; conv_problem.SetupFloats(ctx); + conv_problem.SetupComputeType(ctx); const auto invoker_factory = GetSolver().GetSolver().GetInvokeFactory(ctx, conv_problem, perf_cfg.value_or("")); auto kernel_handles = std::vector{std::begin(kernels), std::end(kernels)}; @@ -254,6 +255,7 @@ void Solution::RunImpl(const Handle& handle, auto conv_ctx = ExecutionContext{&handle}; conv_problem.SetupFloats(conv_ctx); + conv_problem.SetupComputeType(conv_ctx); decltype(auto) db = MakeConvDbGetter(conv_ctx); const auto conv_solution = GetSolver().GetSolver().FindSolution( diff --git a/projects/miopen/src/solver/conv/conv_MP_bidirectional_winograd.cpp b/projects/miopen/src/solver/conv/conv_MP_bidirectional_winograd.cpp index e599d1cde67c..ec27ce8910d0 100644 --- a/projects/miopen/src/solver/conv/conv_MP_bidirectional_winograd.cpp +++ b/projects/miopen/src/solver/conv/conv_MP_bidirectional_winograd.cpp @@ -744,6 +744,7 @@ ExecutionContext ConvMPBidirectWinograd_xdlops(use_tf32); + return kernel_name.str(); } diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index a81247e6e508..e766706e0e33 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -63,7 +63,7 @@ using Bilinear = ck::tensor_operation::element_wise: using Scale = ck::tensor_operation::element_wise::Scale; static constexpr ck::index_t NumDimSpatial = 3; -template +template using DeviceOpGBwdBilinear = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD; + Bilinear, + ComputeType>; -template +template using DeviceOpGBwdScale = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD; + Scale, + ComputeType>; -template +template using DeviceOpGBwdDefault = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD; + ComputeType>; -template +template using DeviceOpGBwdBilinearPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGBwdBilinear>; + DeviceOpGBwdBilinear>; -template +template using DeviceOpGBwdScalePtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGBwdScale>; + DeviceOpGBwdScale>; -template +template using DeviceOpGBwdDefaultPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGBwdDefault>; + DeviceOpGBwdDefault>; namespace { -template +template struct CKArgs { CKArgs(const ::miopen::conv::ProblemDescription& problem) @@ -211,11 +212,11 @@ struct CKArgs float beta) const { using DeviceP = std::remove_pointer_t; - if constexpr(std::is_same_v>) + if constexpr(std::is_same_v>) { return MakeBilinearArgPtr(conv_ptr, in, w, out, alpha, beta); } - else if constexpr(std::is_same_v>) + else if constexpr(std::is_same_v>) { (void)beta; return MakeScaleArgPtr(conv_ptr, in, w, out, alpha); @@ -224,7 +225,7 @@ struct CKArgs { (void)alpha; (void)beta; - static_assert(std::is_same_v>, + static_assert(std::is_same_v>, "Default should be bwd pass through"); return MakeDefaultArgPtr(conv_ptr, in, w, out); } @@ -351,18 +352,21 @@ struct CKArgs std::array rPadding; }; -template +template std::vector FillValidKernelsByAlphaBeta(const ::miopen::conv::ProblemDescription& problem) { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return FillValidKernelsIDs, CKArgs>(problem); + return FillValidKernelsIDs, + CKArgs>(problem); case SCALE: - return FillValidKernelsIDs, CKArgs>(problem); + return FillValidKernelsIDs, + CKArgs>(problem); default: - return FillValidKernelsIDs, CKArgs>(problem); + return FillValidKernelsIDs, + CKArgs>(problem); } } } // namespace @@ -389,43 +393,51 @@ std::vector GetAllBwdKernelTypeStrings() return all_kernels; } -template -void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::Init( +template +bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::Init( const ::miopen::conv::ProblemDescription& problem) { - valid_kernels = FillValidKernelsByAlphaBeta(problem); - index = 0; - kernel_id = valid_kernels[index]; + valid_kernels = FillValidKernelsByAlphaBeta(problem); + if(valid_kernels.empty()) + return false; + index = 0; + kernel_id = valid_kernels[index]; + return true; } -template +template bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::CheckIsSupportCKArgs( const ::miopen::conv::ProblemDescription& problem) const { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); case SCALE: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); default: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); } } -template +template bool ConvHipImplicitGemm3DGroupBwdXdlops::CheckCKApplicability( const ::miopen::conv::ProblemDescription& problem) const { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return IsCKApplicable, CKArgs>(problem); - case SCALE: return IsCKApplicable, CKArgs>(problem); - default: return IsCKApplicable, CKArgs>(problem); + return IsCKApplicable, + CKArgs>(problem); + case SCALE: + return IsCKApplicable, + CKArgs>(problem); + default: + return IsCKApplicable, + CKArgs>(problem); } } @@ -435,7 +447,17 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::InitValidKernels( switch(problem.GetInDataType()) { case miopenHalf: Init(problem); break; - case miopenFloat: Init(problem); break; + case miopenFloat: + if(problem.UseTF32() && Init(problem)) + { + use_tf32 = true; + } + else + { + use_tf32 = false; + Init(problem); + } + break; case miopenInt8: Init(problem); break; case miopenBFloat16: Init(problem); break; default: break; // Unsupported data types - valid_kernels remains empty @@ -464,11 +486,12 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit( bool ai_success = false; miopen::ai::tuning::candidate_selection::CandidateSelectionResult result; - auto run_ai_heuristics = [&](auto CKDataType) { - using T = decltype(CKDataType); + auto run_ai_heuristics = [&](auto CKDataType, auto CKComputeType) { + using T = decltype(CKDataType); + using TCompute = decltype(CKComputeType); auto fill_valid_kernels = [=](const ::miopen::conv::ProblemDescription& problem) -> std::vector { - return FillValidKernelsByAlphaBeta(problem); + return FillValidKernelsByAlphaBeta(problem); }; // Validation lambda for AI-predicted kernel + split_k combinations // Note: This solver currently doesn't use split_k (always 0), but validation @@ -497,9 +520,27 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit( }; switch(problem.GetInDataType()) { - case miopenHalf: std::tie(ai_success, result) = run_ai_heuristics(ck::half_t{}); break; - case miopenFloat: std::tie(ai_success, result) = run_ai_heuristics(float{}); break; - case miopenBFloat16: std::tie(ai_success, result) = run_ai_heuristics(ck::bhalf_t{}); break; + case miopenHalf: + std::tie(ai_success, result) = run_ai_heuristics(ck::half_t{}, ck::half_t{}); + break; + case miopenFloat: + if(problem.UseTF32()) + { + std::tie(ai_success, result) = run_ai_heuristics(float{}, ck::tf32_t{}); + if(!ai_success || result.IsEmpty()) + { + MIOPEN_LOG_I2("Step 1: AI heuristics with TF32 failed, retrying with FP32"); + std::tie(ai_success, result) = run_ai_heuristics(float{}, float{}); + } + } + else + { + std::tie(ai_success, result) = run_ai_heuristics(float{}, float{}); + } + break; + case miopenBFloat16: + std::tie(ai_success, result) = run_ai_heuristics(ck::bhalf_t{}, ck::bhalf_t{}); + break; default: break; } @@ -571,7 +612,17 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid( switch(problem.GetInDataType()) { case miopenHalf: return CheckIsSupportCKArgs(problem); - case miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat: + if(problem.UseTF32() && CheckIsSupportCKArgs(problem)) + { + use_tf32 = true; + return true; + } + else + { + use_tf32 = false; + return CheckIsSupportCKArgs(problem); + } case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenBFloat16: return CheckIsSupportCKArgs(problem); case miopenInt64: @@ -648,7 +699,15 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( switch(problem.GetInDataType()) { case miopenHalf: return CheckCKApplicability(problem); - case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat: + if(problem.UseTF32() && CheckCKApplicability(problem)) + { + return true; + } + else + { + return CheckCKApplicability(problem); + } case miopenInt8: return CheckCKApplicability(problem); case miopenBFloat16: return CheckCKApplicability(problem); case miopenInt64: @@ -669,57 +728,60 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL return MakeSolutionGroupConvImplicitGemmXdlops( problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); switch(problem.GetAlphaBetaCase()) { case BILINEAR: return InitInvokerFactoryBwdNCHW<3, false, - DeviceOpGBwdBilinearPtrs, - CKArgs, + DeviceOpGBwdBilinearPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); case SCALE: return InitInvokerFactoryBwdNCHW<3, false, - DeviceOpGBwdScalePtrs, - CKArgs, + DeviceOpGBwdScalePtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); default: return InitInvokerFactoryBwdNCHW<3, false, - DeviceOpGBwdDefaultPtrs, - CKArgs, + DeviceOpGBwdDefaultPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); } }, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); switch(problem.GetAlphaBetaCase()) { case BILINEAR: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGBwdBilinearPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); case SCALE: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGBwdScalePtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); default: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGBwdDefaultPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); } - }); + }, + config.UseTF32()); #else return {}; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 10ef1d0998b2..5637504cefb8 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -64,7 +64,7 @@ using Bilinear = ck::tensor_operation::element_wise: using Scale = ck::tensor_operation::element_wise::Scale; static constexpr ck::index_t NumDimSpatial = 3; -template +template using DeviceOpGFwdBilinear = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; + Bilinear, + ComputeType, + ComputeType>; -template +template using DeviceOpGFwdScale = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; - -template + Scale, + ComputeType, + ComputeType>; +template using DeviceOpGFwdDefault = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD; + PassThrough, + ComputeType, + ComputeType>; -template +template using DeviceOpGFwdBilinearPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGFwdBilinear>; + DeviceOpGFwdBilinear>; -template +template using DeviceOpGFwdScalePtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGFwdScale>; + DeviceOpGFwdScale>; -template +template using DeviceOpGFwdDefaultPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGFwdDefault>; + DeviceOpGFwdDefault>; namespace { -template +template struct CKArgs { CKArgs(const ::miopen::conv::ProblemDescription& problem) @@ -206,11 +211,11 @@ struct CKArgs float beta) const { using DeviceP = std::remove_pointer_t; - if constexpr(std::is_same_v>) + if constexpr(std::is_same_v>) { return MakeBilinearArgPtr(conv_ptr, in, w, out, alpha, beta); } - else if constexpr(std::is_same_v>) + else if constexpr(std::is_same_v>) { (void)beta; return MakeScaleArgPtr(conv_ptr, in, w, out, alpha); @@ -219,7 +224,7 @@ struct CKArgs { (void)alpha; (void)beta; - static_assert(std::is_same_v>, + static_assert(std::is_same_v>, "Default should be fwd pass through"); return MakeDefaultArgPtr(conv_ptr, in, w, out); } @@ -347,21 +352,21 @@ struct CKArgs miopenAlphaBetaCase_t alpha_beta_case; }; -template +template std::vector FillValidKernelsByAlphaBeta(const ::miopen::conv::ProblemDescription& problem) { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return miopen::solver::FillValidKernelsIDs, - CKArgs>(problem); + return miopen::solver::FillValidKernelsIDs, + CKArgs>(problem); case SCALE: - return miopen::solver::FillValidKernelsIDs, - CKArgs>(problem); + return miopen::solver::FillValidKernelsIDs, + CKArgs>(problem); default: - return miopen::solver::FillValidKernelsIDs, - CKArgs>(problem); + return miopen::solver::FillValidKernelsIDs, + CKArgs>(problem); } } } // namespace @@ -388,43 +393,51 @@ std::vector GetAllFwdKernelTypeStrings() return all_kernels; } -template -void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init( - const ::miopen::conv::ProblemDescription& problem) +template +bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init( + const miopen::conv::ProblemDescription& problem) { - valid_kernels = FillValidKernelsByAlphaBeta(problem); - index = 0; - kernel_id = valid_kernels[index]; + valid_kernels = FillValidKernelsByAlphaBeta(problem); + if(valid_kernels.empty()) + return false; + index = 0; + kernel_id = valid_kernels[index]; + return true; } -template +template bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::CheckIsSupportCKArgs( const ::miopen::conv::ProblemDescription& problem) const { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); case SCALE: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); default: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); } } -template +template bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( const ::miopen::conv::ProblemDescription& problem) const { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return IsCKApplicable, CKArgs>(problem); - case SCALE: return IsCKApplicable, CKArgs>(problem); - default: return IsCKApplicable, CKArgs>(problem); + return IsCKApplicable, + CKArgs>(problem); + case SCALE: + return IsCKApplicable, + CKArgs>(problem); + default: + return IsCKApplicable, + CKArgs>(problem); } } @@ -434,7 +447,17 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::InitValidKernels( switch(problem.GetInDataType()) { case miopenHalf: Init(problem); break; - case miopenFloat: Init(problem); break; + case miopenFloat: + if(problem.UseTF32() && Init(problem)) + { + use_tf32 = true; + } + else + { + use_tf32 = false; + Init(problem); + } + break; case miopenInt8: Init(problem); break; case miopenBFloat16: Init(problem); break; default: break; // Unsupported data types - valid_kernels remains empty @@ -588,11 +611,12 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( bool ai_success = false; miopen::ai::tuning::candidate_selection::CandidateSelectionResult result; - auto run_ai_heuristics = [&](auto CKDataType) { - using T = decltype(CKDataType); + auto run_ai_heuristics = [&](auto CKDataType, auto CKComputeType) { + using T = decltype(CKDataType); + using TCompute = decltype(CKComputeType); auto fill_valid_kernels = [=](const ::miopen::conv::ProblemDescription& problem) -> std::vector { - return FillValidKernelsByAlphaBeta(problem); + return FillValidKernelsByAlphaBeta(problem); }; // Validation lambda for AI-predicted kernel + split_k combinations // Note: This solver currently doesn't use split_k (always 0), but validation @@ -621,9 +645,27 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( }; switch(problem.GetInDataType()) { - case miopenHalf: std::tie(ai_success, result) = run_ai_heuristics(ck::half_t{}); break; - case miopenFloat: std::tie(ai_success, result) = run_ai_heuristics(float{}); break; - case miopenBFloat16: std::tie(ai_success, result) = run_ai_heuristics(ck::bhalf_t{}); break; + case miopenHalf: + std::tie(ai_success, result) = run_ai_heuristics(ck::half_t{}, ck::half_t{}); + break; + case miopenFloat: + if(problem.UseTF32()) + { + std::tie(ai_success, result) = run_ai_heuristics(float{}, ck::tf32_t{}); + if(!ai_success || result.IsEmpty()) + { + MIOPEN_LOG_I2("Step 3: AI heuristics with TF32 failed, retrying with FP32"); + std::tie(ai_success, result) = run_ai_heuristics(float{}, float{}); + } + } + else + { + std::tie(ai_success, result) = run_ai_heuristics(float{}, float{}); + } + break; + case miopenBFloat16: + std::tie(ai_success, result) = run_ai_heuristics(ck::bhalf_t{}, ck::bhalf_t{}); + break; default: break; } if(ai_success && !result.IsEmpty()) @@ -694,7 +736,17 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( switch(problem.GetInDataType()) { case miopenHalf: return CheckIsSupportCKArgs(problem); - case miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat: + if(problem.UseTF32() && CheckIsSupportCKArgs(problem)) + { + use_tf32 = true; + return true; + } + else + { + use_tf32 = false; + return CheckIsSupportCKArgs(problem); + } case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenBFloat16: return CheckIsSupportCKArgs(problem); case miopenInt64: @@ -771,7 +823,15 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( switch(problem.GetInDataType()) { case miopenHalf: return CheckCKApplicability(problem); - case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat: + if(problem.UseTF32() && CheckCKApplicability(problem)) + { + return true; + } + else + { + return CheckCKApplicability(problem); + } case miopenInt8: return CheckCKApplicability(problem); case miopenBFloat16: return CheckCKApplicability(problem); case miopenInt64: @@ -820,57 +880,60 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL return MakeSolutionGroupConvImplicitGemmXdlops( problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); switch(problem.GetAlphaBetaCase()) { case BILINEAR: return InitInvokerFactoryFwdNCHW<3, false, - DeviceOpGFwdBilinearPtrs, - CKArgs, + DeviceOpGFwdBilinearPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); case SCALE: return InitInvokerFactoryFwdNCHW<3, false, - DeviceOpGFwdScalePtrs, - CKArgs, + DeviceOpGFwdScalePtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); default: return InitInvokerFactoryFwdNCHW<3, false, - DeviceOpGFwdDefaultPtrs, - CKArgs, + DeviceOpGFwdDefaultPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); } }, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); switch(problem.GetAlphaBetaCase()) { case BILINEAR: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGFwdBilinearPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); case SCALE: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGFwdScalePtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); default: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGFwdDefaultPtrs, + CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); } - }); + }, + config.UseTF32()); #else return {}; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 29ecdea74c27..320eb85026de 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -55,7 +55,7 @@ namespace conv { namespace { -template +template struct CKArgs { CKArgs(const ::miopen::conv::ProblemDescription& problem) @@ -141,11 +141,11 @@ struct CKArgs int split_k) const { using DeviceP = std::remove_pointer_t; - if constexpr(std::is_same_v>) + if constexpr(std::is_same_v>) { return MakeBilinearArgPtr(conv_ptr, x, dw, dy, alpha, beta, split_k); } - else if constexpr(std::is_same_v>) + else if constexpr(std::is_same_v>) { (void)beta; return MakeScaleArgPtr(conv_ptr, x, dw, dy, alpha, split_k); @@ -154,7 +154,7 @@ struct CKArgs { (void)alpha; (void)beta; - static_assert(std::is_same_v>, + static_assert(std::is_same_v>, "Default should be wrw pass through"); return MakeDefaultArgPtr(conv_ptr, x, dw, dy, split_k); } @@ -311,21 +311,21 @@ struct CKArgs std::array rPadding; }; -template +template std::vector FillValidKernelsByAlphaBeta(const ::miopen::conv::ProblemDescription& problem) { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return FillValidKernelsIDs, CKArgs>( - problem); + return FillValidKernelsIDs, + CKArgs>(problem); case SCALE: - return FillValidKernelsIDs, CKArgs>( - problem); + return FillValidKernelsIDs, + CKArgs>(problem); default: - return FillValidKernelsIDs, CKArgs>( - problem); + return FillValidKernelsIDs, + CKArgs>(problem); } } } // namespace @@ -352,46 +352,52 @@ std::vector GetAllWrwKernelTypeStrings() return all_kernels; } -template -void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::Init( +template +bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::Init( const ::miopen::conv::ProblemDescription& problem) { - valid_kernels = FillValidKernelsByAlphaBeta(problem); - index = 0; - split_k = 1; - kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); + valid_kernels = FillValidKernelsByAlphaBeta(problem); + if(valid_kernels.empty()) + return false; + index = 0; + split_k = 1; + kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); + return true; } -template +template bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::CheckIsSupportCKArgs( const ::miopen::conv::ProblemDescription& problem) const { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return IsCKArgsSupported, CKArgs>( - problem, kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); case SCALE: - return IsCKArgsSupported, CKArgs>( - problem, kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); default: - return IsCKArgsSupported, CKArgs>( - problem, kernel_id); + return IsCKArgsSupported, + CKArgs>(problem, kernel_id); } } -template +template bool ConvHipImplicitGemm3DGroupWrwXdlops::CheckCKApplicability( const ::miopen::conv::ProblemDescription& problem) const { switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return IsCKApplicable, CKArgs>(problem); + return IsCKApplicable, + CKArgs>(problem); case SCALE: - return IsCKApplicable, CKArgs>(problem); + return IsCKApplicable, + CKArgs>(problem); default: - return IsCKApplicable, CKArgs>(problem); + return IsCKApplicable, + CKArgs>(problem); } } void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::InitValidKernels( @@ -400,7 +406,17 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::InitValidKernels( switch(problem.GetInDataType()) { case miopenHalf: Init(problem); break; - case miopenFloat: Init(problem); break; + case miopenFloat: + if(problem.UseTF32() && Init(problem)) + { + use_tf32 = true; + } + else + { + use_tf32 = false; + Init(problem); + } + break; case miopenInt8: Init(problem); break; case miopenBFloat16: Init(problem); break; default: break; // Unsupported data types - valid_kernels remains empty @@ -430,11 +446,12 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( bool ai_success = false; miopen::ai::tuning::candidate_selection::CandidateSelectionResult result; - auto run_ai_heuristics = [&](auto CKDataType) { - using T = decltype(CKDataType); + auto run_ai_heuristics = [&](auto CKDataType, auto CKComputeType) { + using T = decltype(CKDataType); + using TCompute = decltype(CKComputeType); auto fill_valid_kernels = [=](const ::miopen::conv::ProblemDescription& problem) -> std::vector { - return FillValidKernelsByAlphaBeta(problem); + return FillValidKernelsByAlphaBeta(problem); }; // Validation lambda for AI-predicted kernel + split_k combinations @@ -476,9 +493,27 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( }; switch(problem.GetInDataType()) { - case miopenHalf: std::tie(ai_success, result) = run_ai_heuristics(ck::half_t{}); break; - case miopenFloat: std::tie(ai_success, result) = run_ai_heuristics(float{}); break; - case miopenBFloat16: std::tie(ai_success, result) = run_ai_heuristics(ck::bhalf_t{}); break; + case miopenHalf: + std::tie(ai_success, result) = run_ai_heuristics(ck::half_t{}, ck::half_t{}); + break; + case miopenFloat: + if(problem.UseTF32()) + { + std::tie(ai_success, result) = run_ai_heuristics(float{}, ck::tf32_t{}); + if(!ai_success || result.IsEmpty()) + { + MIOPEN_LOG_I2("Step 3: AI heuristics with TF32 failed, retrying with FP32"); + std::tie(ai_success, result) = run_ai_heuristics(float{}, float{}); + } + } + else + { + std::tie(ai_success, result) = run_ai_heuristics(float{}, float{}); + } + break; + case miopenBFloat16: + std::tie(ai_success, result) = run_ai_heuristics(ck::bhalf_t{}, ck::bhalf_t{}); + break; default: break; } @@ -569,7 +604,17 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid( switch(problem.GetInDataType()) { case miopenHalf: return CheckIsSupportCKArgs(problem); - case miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat: + if(problem.UseTF32() && CheckIsSupportCKArgs(problem)) + { + use_tf32 = true; + return true; + } + else + { + use_tf32 = false; + return CheckIsSupportCKArgs(problem); + } case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenBFloat16: return CheckIsSupportCKArgs(problem); case miopenInt64: @@ -605,7 +650,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig( return config.IsValid(problem); } -template +template size_t ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize( const ::miopen::conv::ProblemDescription& problem) const { @@ -613,14 +658,14 @@ size_t ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize( switch(problem.GetAlphaBetaCase()) { case BILINEAR: - return GetCKSplitkMaxWorkspaceSize, - CKArgs>(problem); + return GetCKSplitkMaxWorkspaceSize, + CKArgs>(problem); case SCALE: - return GetCKSplitkMaxWorkspaceSize, CKArgs>( - problem); + return GetCKSplitkMaxWorkspaceSize, + CKArgs>(problem); default: - return GetCKSplitkMaxWorkspaceSize, - CKArgs>(problem); + return GetCKSplitkMaxWorkspaceSize, + CKArgs>(problem); } #else return 0; @@ -634,7 +679,9 @@ size_t ConvHipImplicitGemm3DGroupWrwXdlops::GetCKMaxWorkspaceSize( switch(problem.GetInDataType()) { case miopenHalf: return GetCKMaxWorkspaceSize(problem); - case miopenFloat: return GetCKMaxWorkspaceSize(problem); + case miopenFloat: + // fp32 and tf32 use same workspace size + return GetCKMaxWorkspaceSize(problem); case miopenInt8: return GetCKMaxWorkspaceSize(problem); case miopenBFloat16: return GetCKMaxWorkspaceSize(problem); case miopenInt64: @@ -689,7 +736,15 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( switch(problem.GetInDataType()) { case miopenHalf: return CheckCKApplicability(problem); - case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat: + if(problem.UseTF32() && CheckCKApplicability(problem)) + { + return true; + } + else + { + return CheckCKApplicability(problem); + } case miopenInt8: return CheckCKApplicability(problem); case miopenBFloat16: return CheckCKApplicability(problem); case miopenInt64: @@ -710,57 +765,60 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL return MakeSolutionGroupConvImplicitGemmXdlops( problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); switch(problem.GetAlphaBetaCase()) { case BILINEAR: return InitInvokerFactoryWrwNCHW<3, false, - DeviceOpGBwdWeightBilinearPtrs, - CKArgs, + DeviceOpGBwdWeightBilinearPtrs, + CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); case SCALE: return InitInvokerFactoryWrwNCHW<3, false, - DeviceOpGBwdWeightScalePtrs, - CKArgs, + DeviceOpGBwdWeightScalePtrs, + CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); default: return InitInvokerFactoryWrwNCHW<3, false, - DeviceOpGBwdWeightDefaultPtrs, - CKArgs, + DeviceOpGBwdWeightDefaultPtrs, + CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); } }, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); switch(problem.GetAlphaBetaCase()) { case BILINEAR: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGBwdWeightBilinearPtrs, + CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); case SCALE: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGBwdWeightScalePtrs, + CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); default: return InitInvokerFactoryNHWC, - CKArgs, + DeviceOpGBwdWeightDefaultPtrs, + CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); } - }); + }, + config.UseTF32()); #else return {}; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp index b60d062219eb..5c1877fd5243 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_bwd_xdlops.cpp @@ -220,16 +220,43 @@ struct CKArgs template void PerformanceConfigHipImplicitGemmGroupBwdXdlops::Init(const ProblemDescription& problem) { - valid_kernels = FillValidKernelsIDs, CKArgs>(problem); - index = 0; - split_k = 1; - kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); + if constexpr(std::is_same_v) + { + if(problem.UseTF32()) + { + use_tf32 = true; + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + } + else + { + use_tf32 = false; + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + } + else + { + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + index = 0; + split_k = 1; + kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } template bool PerformanceConfigHipImplicitGemmGroupBwdXdlops::CheckIsSupportCKArgs( const ProblemDescription& problem) const { + if constexpr(std::is_same_v) + { + if(problem.UseTF32() && + IsCKArgsSupported, CKArgs>(problem, kernel_id)) + { + use_tf32 = true; + return true; + } + use_tf32 = false; + } return IsCKArgsSupported, CKArgs>(problem, kernel_id); } @@ -237,6 +264,14 @@ template bool ConvHipImplicitGemmGroupBwdXdlops::CheckCKApplicability( const ProblemDescription& problem) const { + if constexpr(std::is_same_v) + { + if(problem.UseTF32() && + IsCKApplicable, CKArgs>(problem)) + { + return true; + } + } return IsCKApplicable, CKArgs>(problem); } @@ -363,8 +398,20 @@ template bool PerformanceConfigHipImplicitGemmGroupBwdXdlops::RunParameterPredictionModel( const ExecutionContext& ctx, const ProblemDescription& problem) { - valid_kernels = FillValidKernelsIDs, CKArgs>( - problem); // filter valid_kernel ID's + // filter valid_kernel ID's + if constexpr(std::is_same_v) + { + if(problem.UseTF32()) + { + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + } + } + if(valid_kernels.empty()) + { + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + InitHeuristicKernelIDs(); static const std::string& arch = ctx.GetStream().GetDeviceName(); static std::string solver = "ConvHipIgemmGroupBwdXdlops"; @@ -620,23 +667,26 @@ ConvSolution ConvHipImplicitGemmGroupBwdXdlops::GetSolution( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL return MakeSolutionGroupConvImplicitGemmXdlops( problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); return InitInvokerFactoryBwdNCHW<2, false, - DeviceOpGBwdPtrs, + DeviceOpGBwdPtrs, CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); }, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); return InitInvokerFactoryNHWC, + DeviceOpGBwdPtrs, CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); - }); + }, + config.UseTF32()); #else return {}; diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp index 51f49e1a3a97..5eaf1988a5d6 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp @@ -48,7 +48,7 @@ namespace conv { using ProblemDescription = miopen::conv::ProblemDescription; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL -template +template using DeviceOpGFwd = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< 2, ck::tensor_layout::convolution::NHWGC, @@ -61,11 +61,12 @@ using DeviceOpGFwd = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleA DataType, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough>; + ck::tensor_operation::element_wise::PassThrough, + ComputeType>; -template -using DeviceOpGFwdPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory>; +template +using DeviceOpGFwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGFwd>; namespace { struct CKArgs @@ -185,7 +186,26 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::Init( const ProblemDescription& problem) // should be parameterized with execution context { if(valid_kernels.empty()) - valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + { + if constexpr(std::is_same_v) + { + if(problem.UseTF32()) + { + use_tf32 = true; + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + } + if(valid_kernels.empty()) + { + use_tf32 = false; + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + } + else + { + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + } index = 0; kernel_id = valid_kernels[index]; } @@ -194,6 +214,16 @@ template bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::CheckIsSupportCKArgs( const ProblemDescription& problem) const { + if constexpr(std::is_same_v) + { + if(problem.UseTF32() && + IsCKArgsSupported, CKArgs>(problem, kernel_id)) + { + use_tf32 = true; + return true; + } + use_tf32 = false; + } return IsCKArgsSupported, CKArgs>(problem, kernel_id); } @@ -201,6 +231,14 @@ template bool ConvHipImplicitGemmGroupFwdXdlops::CheckCKApplicability( const ProblemDescription& problem) const { + if constexpr(std::is_same_v) + { + if(problem.UseTF32() && + IsCKApplicable, CKArgs>(problem)) + { + return true; + } + } return IsCKApplicable, CKArgs>(problem); } @@ -326,8 +364,19 @@ template bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::RunParameterPredictionModel( const ExecutionContext& ctx, const ProblemDescription& problem) { - valid_kernels = FillValidKernelsIDs, CKArgs>( - problem); // filter valid_kernel ID's + // filter valid_kernel ID's + if constexpr(std::is_same_v) + { + if(problem.UseTF32()) + { + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + } + } + if(valid_kernels.empty()) + { + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } static const std::string& arch = ctx.GetStream().GetDeviceName(); if(arch == "gfx90a") InitHeuristicKernelIDs("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle"); @@ -555,23 +604,26 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL return MakeSolutionGroupConvImplicitGemmXdlops( problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); return InitInvokerFactoryFwdNCHW<2, false, - DeviceOpGFwdPtrs, + DeviceOpGFwdPtrs, CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); }, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); return InitInvokerFactoryNHWC, + DeviceOpGFwdPtrs, CKArgs, miopen::conv::DataInvokeParams>( ctx, problem, config.kernel_id); - }); + }, + config.UseTF32()); #else return {}; #endif diff --git a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp index 608922e498d6..18917279ce8a 100644 --- a/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp +++ b/projects/miopen/src/solver/conv/conv_hip_implicit_gemm_grouped_wrw_xdlops.cpp @@ -49,9 +49,9 @@ using ProblemDescription = miopen::conv::ProblemDescription; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL -template -using DeviceOpGWrwPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory>; +template +using DeviceOpGWrwPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGWrw>; namespace { @@ -220,16 +220,43 @@ struct CKArgs template void PerformanceConfigHipImplicitGemmGroupWrwXdlops::Init(const ProblemDescription& problem) { - valid_kernels = FillValidKernelsIDs, CKArgs>(problem); - index = 0; - split_k = 1; - kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); + if constexpr(std::is_same_v) + { + if(problem.UseTF32()) + { + use_tf32 = true; + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + } + else + { + use_tf32 = false; + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + } + else + { + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + index = 0; + split_k = 1; + kernel_id = valid_kernels[index] + "+" + std::to_string(split_k); } template bool PerformanceConfigHipImplicitGemmGroupWrwXdlops::CheckIsSupportCKArgs( const ProblemDescription& problem) const { + if constexpr(std::is_same_v) + { + if(problem.UseTF32() && + IsCKArgsSupported, CKArgs>(problem, kernel_id)) + { + use_tf32 = true; + return true; + } + use_tf32 = false; + } return IsCKArgsSupported, CKArgs>(problem, kernel_id); } @@ -237,6 +264,14 @@ template bool ConvHipImplicitGemmGroupWrwXdlops::CheckCKApplicability( const ProblemDescription& problem) const { + if constexpr(std::is_same_v) + { + if(problem.UseTF32() && + IsCKApplicable, CKArgs>(problem)) + { + return true; + } + } return IsCKApplicable, CKArgs>(problem); } @@ -395,8 +430,20 @@ template bool PerformanceConfigHipImplicitGemmGroupWrwXdlops::RunParameterPredictionModel( const ExecutionContext& ctx, const ProblemDescription& problem) { - valid_kernels = FillValidKernelsIDs, CKArgs>( - problem); // filter valid_kernel ID's + // filter valid_kernel ID's + if constexpr(std::is_same_v) + { + if(problem.UseTF32()) + { + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + } + } + if(valid_kernels.empty()) + { + valid_kernels = FillValidKernelsIDs, CKArgs>(problem); + } + static const std::string& arch = ctx.GetStream().GetDeviceName(); if(arch == "gfx90a") InitHeuristicKernelIDs("DeviceGroupedConvBwdWeight_Xdl_CShuffle"); @@ -654,23 +701,26 @@ ConvSolution ConvHipImplicitGemmGroupWrwXdlops::GetSolution( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL return MakeSolutionGroupConvImplicitGemmXdlops( problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); return InitInvokerFactoryWrwNCHW<2, false, - DeviceOpGWrwPtrs, + DeviceOpGWrwPtrs, CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); }, - [&](auto data_type_val) { - using T = decltype(data_type_val); + [&](auto data_type_val, auto compute_type_val) { + using T = decltype(data_type_val); + using TCompute = decltype(compute_type_val); return InitInvokerFactoryNHWC, + DeviceOpGWrwPtrs, CKArgs, miopen::conv::WrWInvokeParams>( ctx, problem, config.kernel_id); - }); + }, + config.UseTF32()); #else return {}; diff --git a/projects/miopen/src/solver/conv_ck_igemm_grp_fwd_activ_fused.cpp b/projects/miopen/src/solver/conv_ck_igemm_grp_fwd_activ_fused.cpp index 03c4f35874e4..3be3c0cb79e1 100644 --- a/projects/miopen/src/solver/conv_ck_igemm_grp_fwd_activ_fused.cpp +++ b/projects/miopen/src/solver/conv_ck_igemm_grp_fwd_activ_fused.cpp @@ -661,7 +661,7 @@ GetSolutionForDimensionality(const FusionContext& ctx, using Layouts = LayoutsSelector; return MakeSolutionGroupConvImplicitGemmXdlops( conv_problem, - [&](auto data_type_val) { + [&](auto data_type_val, [[maybe_unused]] auto compute_type_val) { (void)data_type_val; return InitInvokerFactoryFwdNCHW( ctx, conv_problem, config.kernel_id); }, - [&](auto data_type_val) { + [&](auto data_type_val, [[maybe_unused]] auto compute_type_val) { (void)data_type_val; return InitInvokerFactoryNHWC; return MakeSolutionGroupConvImplicitGemmXdlops( conv_problem, - [&](auto data_type_val) { + [&](auto data_type_val, [[maybe_unused]] auto compute_type_val) { (void)data_type_val; return InitInvokerFactoryFwdNCHW( ctx, conv_problem, config.kernel_id); }, - [&](auto data_type_val) { + [&](auto data_type_val, [[maybe_unused]] auto compute_type_val) { (void)data_type_val; return InitInvokerFactoryNHWC= 2) { key = opt[0]; @@ -293,7 +294,12 @@ void ParseProblemKey(const std::string& key_, conv::ProblemDescription& prob_des conv::ProblemDescription tmp{in, wei, out, conv, dir}; } conv.group_count = group_cnt; - prob_desc = conv::ProblemDescription{in, wei, out, conv, dir}; + if(precision == miopenFloat) + { + const auto math_type_ = use_tf32 ? miopenMathDefault : miopenMathPedantic; + conv.attribute.Set(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, static_cast(math_type_)); + } + prob_desc = conv::ProblemDescription{in, wei, out, conv, dir}; } struct FDBVal @@ -627,6 +633,7 @@ void CheckDynamicFDBEntry(size_t thread_index, miopen::conv::ProblemDescription problem; miopen::ParseProblemKey(kinder.first, problem); problem.SetupFloats(ctx); // TODO: Check if this is necessary + problem.SetupComputeType(ctx); std::stringstream ss; problem.Serialize(ss); ASSERT_TRUE(ss.str() == kinder.first) @@ -732,6 +739,7 @@ void CheckFDBEntry(size_t thread_index, miopen::conv::ProblemDescription problem; miopen::ParseProblemKey(kinder.first, problem); problem.SetupFloats(ctx); // TODO: Check if this is necessary + problem.SetupComputeType(ctx); std::stringstream ss; problem.Serialize(ss); // moment of truth diff --git a/projects/miopen/test/gtest/group_conv.hpp b/projects/miopen/test/gtest/group_conv.hpp index 27251a14f051..d9ab9e080898 100644 --- a/projects/miopen/test/gtest/group_conv.hpp +++ b/projects/miopen/test/gtest/group_conv.hpp @@ -317,7 +317,16 @@ struct GroupConvTestFix double threshold = 80; if(CONV_DIR == Direction::Forward) { - threshold *= std::numeric_limits::epsilon(); + if constexpr(std::is_same_v) + { + // float use tf32 compute which share same mantissa bits + threshold *= (compute_type == "TF32") ? std::numeric_limits::epsilon() + : std::numeric_limits::epsilon(); + } + else + { + threshold *= std::numeric_limits::epsilon(); + } } else { @@ -353,6 +362,9 @@ struct GroupConvTestFix ctx.SetStream(&handle); + if(compute_type == "TF32") + problem.SetupComputeType(ctx); + if(!solv.IsApplicable(ctx, problem)) { test_skipped = true; @@ -463,6 +475,11 @@ struct GroupConvTestFix test_skipped = true; GTEST_SKIP() << "bf16 tests skipped on this hardware."; } + if(!IsTestSupportedByDevice(Gpu::gfx94X) && compute_type == "TF32") + { + test_skipped = true; + GTEST_SKIP() << "tf32 tests skipped on this hardware."; + } } float alpha_val; @@ -477,6 +494,10 @@ struct GroupConvTestFix weights = tensor{tensor_layout, conv_config.GetWeights()}; conv_desc = conv_config.GetConv(); + if(compute_type == "TF32") + conv_desc.attribute.Set(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, miopenMathDefault); + else + conv_desc.attribute.Set(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, miopenMathPedantic); miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{}); @@ -561,6 +582,8 @@ struct GroupConvTestFix miopen::Scalar alpha{1.0}; miopen::Scalar beta{0.0}; + + std::string compute_type; }; template @@ -613,6 +636,7 @@ std::vector GetBetaValues() struct GPU_GroupConv##ndim##D_##dir##_##naming_type \ : GroupConvTestFix \ { \ + GPU_GroupConv##ndim##D_##dir##_##naming_type() { compute_type = #naming_type; } \ }; \ TEST_P(GPU_GroupConv##ndim##D_##dir##_##naming_type, GroupConv##ndim##D_##dir##_##type##_Test) \ { \ diff --git a/projects/miopen/test/gtest/group_conv2d_fwd.cpp b/projects/miopen/test/gtest/group_conv2d_fwd.cpp index bf1332f97042..188daa298a32 100644 --- a/projects/miopen/test/gtest/group_conv2d_fwd.cpp +++ b/projects/miopen/test/gtest/group_conv2d_fwd.cpp @@ -30,6 +30,7 @@ using namespace group_conv; DEFINE_GROUP_CONV2D_TEST(float, FP32, Forward); +DEFINE_GROUP_CONV2D_TEST(float, TF32, Forward); DEFINE_GROUP_CONV2D_TEST(half, FP16, Forward); DEFINE_GROUP_CONV2D_TEST(bfloat16, BFP16, Forward); DEFINE_GROUP_CONV2D_TEST(int8_t, I8, Forward); diff --git a/projects/miopen/test/gtest/group_conv3d_fwd.cpp b/projects/miopen/test/gtest/group_conv3d_fwd.cpp index 7d661ffe123b..df3d61732ba7 100644 --- a/projects/miopen/test/gtest/group_conv3d_fwd.cpp +++ b/projects/miopen/test/gtest/group_conv3d_fwd.cpp @@ -30,6 +30,7 @@ using namespace group_conv; DEFINE_GROUP_CONV3D_TEST(float, FP32, Forward); +DEFINE_GROUP_CONV3D_TEST(float, TF32, Forward); DEFINE_GROUP_CONV3D_TEST(half, FP16, Forward); DEFINE_GROUP_CONV3D_TEST(bfloat16, BFP16, Forward); /// \todo int8_t tests don't work. Need debugging diff --git a/projects/miopen/test/gtest/solver_bwd.hpp b/projects/miopen/test/gtest/solver_bwd.hpp index e9affc9f6aec..b27f7565c4e6 100644 --- a/projects/miopen/test/gtest/solver_bwd.hpp +++ b/projects/miopen/test/gtest/solver_bwd.hpp @@ -56,6 +56,7 @@ struct ConvBwdSolverTest const miopen::ExecutionContext ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); diff --git a/projects/miopen/test/gtest/solver_fwd.hpp b/projects/miopen/test/gtest/solver_fwd.hpp index 6d74ab151c52..17bfcd0a6aaf 100644 --- a/projects/miopen/test/gtest/solver_fwd.hpp +++ b/projects/miopen/test/gtest/solver_fwd.hpp @@ -58,6 +58,7 @@ struct ConvFwdSolverTest const miopen::ExecutionContext ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); diff --git a/projects/miopen/test/gtest/solver_wrw.hpp b/projects/miopen/test/gtest/solver_wrw.hpp index 1f2285bc051c..4ef34f0faf8c 100644 --- a/projects/miopen/test/gtest/solver_wrw.hpp +++ b/projects/miopen/test/gtest/solver_wrw.hpp @@ -56,6 +56,7 @@ struct ConvWrwSolverTest const miopen::ExecutionContext ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); diff --git a/projects/miopen/test/gtest/unit_FinInterface.cpp b/projects/miopen/test/gtest/unit_FinInterface.cpp index 3ddb91bc1c86..2e4744ce714a 100644 --- a/projects/miopen/test/gtest/unit_FinInterface.cpp +++ b/projects/miopen/test/gtest/unit_FinInterface.cpp @@ -417,6 +417,7 @@ auto GetContext(miopen::Handle* handle, const miopen::conv::ProblemDescription& { auto tmp = miopen::ExecutionContext{handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; } diff --git a/projects/miopen/test/gtest/unit_conv_ConvolutionDescriptor.hpp b/projects/miopen/test/gtest/unit_conv_ConvolutionDescriptor.hpp index 737d4193dfca..16f5796e655d 100644 --- a/projects/miopen/test/gtest/unit_conv_ConvolutionDescriptor.hpp +++ b/projects/miopen/test/gtest/unit_conv_ConvolutionDescriptor.hpp @@ -37,12 +37,14 @@ struct ConvolutionDescriptorParams std::vector&& strides_in, std::vector&& dilations_in, int group_count_in = 1, - bool deterministic_in = false) + bool deterministic_in = false, + bool tf32_compute_in = false) : pads(std::move(pads_in)), strides(std::move(strides_in)), dilations(std::move(dilations_in)), group_count(group_count_in), - deterministic(deterministic_in) + deterministic(deterministic_in), + tf32_compute(tf32_compute_in) { } @@ -60,6 +62,10 @@ struct ConvolutionDescriptorParams { desc.attribute.Set(MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC, 1); } + // SET TF32 COMPUTE ATTRIBUTE + miopenMathType_t math_type = tf32_compute ? miopenMathDefault : miopenMathPedantic; + desc.attribute.Set(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, math_type); + return desc; } @@ -78,6 +84,7 @@ struct ConvolutionDescriptorParams std::vector dilations; int group_count; bool deterministic; + bool tf32_compute; }; } // namespace unit_tests diff --git a/projects/miopen/test/gtest/unit_conv_solver.cpp b/projects/miopen/test/gtest/unit_conv_solver.cpp index 50239f9da3ad..f872d46b0d55 100755 --- a/projects/miopen/test/gtest/unit_conv_solver.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver.cpp @@ -341,10 +341,18 @@ miopen::solver::ConvSolution FindSolution(const miopen::solver::conv::ConvSolver template double GetThreshold(miopenConvAlgorithm_t algo, miopen::conv::Direction direction, - const Tolerances& tolerances) + const Tolerances& tolerances, + const bool use_tf32_compute) { double tolerance = tolerances.Get(GetDevGpuType(), miopen_type{}); double threshold = std::numeric_limits::epsilon() * tolerance; + if constexpr(std::is_same_v) + { + if(use_tf32_compute) + { + threshold = std::numeric_limits::epsilon() * tolerance; + } + } return threshold; } @@ -353,7 +361,8 @@ void VerifyData(const std::vector& data, const std::vector& ref_data, miopenConvAlgorithm_t algo, miopen::conv::Direction direction, - const Tolerances& tolerances) + const Tolerances& tolerances, + bool use_tf32_compute = false) { ASSERT_FALSE(miopen::range_zero(ref_data)) << "Reference data is all zeros"; if constexpr(!std::is_integral_v) @@ -380,7 +389,7 @@ void VerifyData(const std::vector& data, else { const auto error = miopen::rms_range(ref_data, data); - const double threshold = GetThreshold(algo, direction, tolerances); + const double threshold = GetThreshold(algo, direction, tolerances, use_tf32_compute); ASSERT_LT(error, threshold) << "Error beyond tolerance"; // std::cout << "error: " << error << " threshold: " << threshold << std::endl; } @@ -436,9 +445,18 @@ void RunSolverFwd(const miopen::solver::conv::ConvSolverInterface& solv, const auto ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); + auto device_name = ctx.GetStream().GetDeviceName(); + if(!(miopen::StartsWith(device_name, "gfx942") || miopen::StartsWith(device_name, "gfx950")) && + conv_config.GetXDataType() == miopenFloat && + conv_config.GetConv().GetMathType() == miopenMathDefault) + { + GTEST_SKIP() << "TF32 test is not supported on this device"; + } + if(!solv.IsApplicable(ctx, problem)) { // Do not put GTEST_SKIP here. @@ -494,8 +512,12 @@ void RunSolverFwd(const miopen::solver::conv::ConvSolverInterface& solv, output.data = handle.Read(out_dev, output.data.size()); - VerifyData( - output.data, ref_out.data, algo, miopen::conv::Direction::Forward, params.tolerances); + VerifyData(output.data, + ref_out.data, + algo, + miopen::conv::Direction::Forward, + params.tolerances, + problem.UseTF32()); } template @@ -557,6 +579,7 @@ void RunSolverBwd(const miopen::solver::conv::ConvSolverInterface& solv, const auto ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); @@ -615,8 +638,12 @@ void RunSolverBwd(const miopen::solver::conv::ConvSolverInterface& solv, input.data = handle.Read(in_dev, input.data.size()); - VerifyData( - input.data, ref_in.data, algo, miopen::conv::Direction::BackwardData, params.tolerances); + VerifyData(input.data, + ref_in.data, + algo, + miopen::conv::Direction::BackwardData, + params.tolerances, + problem.UseTF32()); } template @@ -678,6 +705,7 @@ void RunSolverWrw(const miopen::solver::conv::ConvSolverInterface& solv, const auto ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); @@ -740,7 +768,8 @@ void RunSolverWrw(const miopen::solver::conv::ConvSolverInterface& solv, ref_weights.data, algo, miopen::conv::Direction::BackwardWeights, - params.tolerances); + params.tolerances, + problem.UseTF32()); } template @@ -870,6 +899,7 @@ void UnitTestConvSolverDevApplicabilityBase::RunTestImpl( const auto ctx = [&] { auto tmp = miopen::ExecutionContext{&handle}; problem.SetupFloats(tmp); + problem.SetupComputeType(tmp); return tmp; }(); diff --git a/projects/miopen/test/gtest/unit_conv_solver.hpp b/projects/miopen/test/gtest/unit_conv_solver.hpp index 4ee8c54aeba8..2186c6644edb 100644 --- a/projects/miopen/test/gtest/unit_conv_solver.hpp +++ b/projects/miopen/test/gtest/unit_conv_solver.hpp @@ -37,6 +37,37 @@ namespace miopen { namespace unit_tests { +// Enum class to represent all compute types including TF32 +enum class TestDataType +{ + I8, + FP8, + BF8, + FP16, + BF16, + FP32, + TF32, + FP64, + I64 +}; + +// Helper function to convert TestDataType to miopenDataType_t +constexpr miopenDataType_t GetDataType(TestDataType type) +{ + constexpr miopenDataType_t type_map[] = { + miopenInt8, + miopenFloat8_fnuz, + miopenBFloat8_fnuz, + miopenHalf, + miopenBFloat16, + miopenFloat, + miopenFloat, + miopenDouble, + miopenInt64, + }; + return type_map[static_cast(type)]; +} + //************************************************************************************ // ConvTestCase //************************************************************************************ @@ -229,6 +260,10 @@ using GPU_UnitTestConvSolverFwd_FP32 = miopen::unit_tests::UnitTestConvSolverFwd using GPU_UnitTestConvSolverBwd_FP32 = miopen::unit_tests::UnitTestConvSolverBwd; using GPU_UnitTestConvSolverWrw_FP32 = miopen::unit_tests::UnitTestConvSolverWrw; +using GPU_UnitTestConvSolverFwd_TF32 = miopen::unit_tests::UnitTestConvSolverFwd; +using GPU_UnitTestConvSolverBwd_TF32 = miopen::unit_tests::UnitTestConvSolverBwd; +using GPU_UnitTestConvSolverWrw_TF32 = miopen::unit_tests::UnitTestConvSolverWrw; + using GPU_UnitTestConvSolverFwd_I8 = miopen::unit_tests::UnitTestConvSolverFwd; using GPU_UnitTestConvSolverBwd_I8 = miopen::unit_tests::UnitTestConvSolverBwd; using GPU_UnitTestConvSolverWrw_I8 = miopen::unit_tests::UnitTestConvSolverWrw; diff --git a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupBwdXdlops.cpp b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupBwdXdlops.cpp index f5e9a9df3dc8..f0f939847ec6 100644 --- a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupBwdXdlops.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupBwdXdlops.cpp @@ -6,33 +6,38 @@ namespace { // numeric part of test case -using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestDataType = miopen::unit_tests::TestDataType; +template auto GetConvSmokeTestCases() { + const bool tf32_compute = type == TestDataType::TF32; std::vector test_cases = { // clang-format off //TestCase {{1, 4, 14, 28, 28}, {4, 4, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 1} - TestCase {{1, 4, 8, 28, 28}, {4, 4, 3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}, 1} + TestCase {{1, 4, 8, 28, 28}, {4, 4, 3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}, 1, false, tf32_compute} // clang-format on }; return test_cases; } +template auto GetConvFullTestCases() { + const bool tf32_compute = type == TestDataType::TF32; std::vector test_cases = { // clang-format off // Group Count = 1 - TestCase {{1, 1, 8, 8, 8}, {1, 1, 2, 2, 2}, {0, 0, 0}, {2, 2, 2}, {1, 1, 1}, 1}, - TestCase {{6, 448, 3, 118, 182}, {896, 448, 1, 1, 1}, {0, 0, 0}, {1, 2, 2}, {1, 1, 1}, 1}, + TestCase {{1, 1, 8, 8, 8}, {1, 1, 2, 2, 2}, {0, 0, 0}, {2, 2, 2}, {1, 1, 1}, 1, false, tf32_compute}, + TestCase {{6, 448, 3, 118, 182}, {896, 448, 1, 1, 1}, {0, 0, 0}, {1, 2, 2}, {1, 1, 1}, 1, false, tf32_compute}, // Group Count > 1 (2, 3, 4) - TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 2}, - TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 2}, - TestCase {{256, 9, 2, 14, 14}, {27, 3, 2, 14, 14}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 3}, - TestCase {{128, 4, 28, 28, 28}, {8, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 4} + TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 2, false, tf32_compute}, + TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 2, false, tf32_compute}, + TestCase {{256, 9, 2, 14, 14}, {27, 3, 2, 14, 14}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 3, false, tf32_compute}, + TestCase {{128, 4, 28, 28, 28}, {8, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 4, false, tf32_compute} // clang-format on }; @@ -43,20 +48,25 @@ auto GetDevApplicabilityConvCase() { // For device applicability checks return GetConvTestForGroupXdlops(miopenTensorNDHWC, - std::move(GetConvSmokeTestCases()[0])); + std::move(GetConvSmokeTestCases()[0])); } -template +template const auto& GetTestParams() { static const auto params = [] { // If MIOpen is built without CK these tests will fail, skip them to avoid failing #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; - - if constexpr(datatype != miopenFloat) + Gpu supportedDevices; + if constexpr(type == TestDataType::FP32) { - supportedDevices = supportedDevices | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; + } + else if constexpr(type == TestDataType::TF32) + { + supportedDevices = Gpu::gfx94X | Gpu::gfx950; + }else{ + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950 | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; } #else Gpu supportedDevices = Gpu::None; @@ -84,6 +94,10 @@ using GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_BFP16 = using GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_FP32 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; +using GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_TF32 = + miopen::unit_tests::UnitTestConvSolverGroupXDlops; + using CPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlopsDevApplicability_FP16 = CPU_UnitTestConvSolverDevApplicabilityBwd_NONE; @@ -103,6 +117,11 @@ TEST_P(GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_FP32, ConvHipImplicitG this->RunTest(miopen::solver::conv::ConvHipImplicitGemm3DGroupBwdXdlops{}); }; +TEST_P(GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_TF32, ConvHipImplicitGemm3DGroupBwdXdlops) +{ + this->RunTest(miopen::solver::conv::ConvHipImplicitGemm3DGroupBwdXdlops{}); +}; + TEST_P(CPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlopsDevApplicability_FP16, ConvHipImplicitGemm3DGroupBwdXdlops) { @@ -112,43 +131,55 @@ TEST_P(CPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlopsDevApplicability_FP16, // Smoke tests INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); + testing::ValuesIn(GetConvSmokeTestCases()))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); + testing::ValuesIn(GetConvSmokeTestCases()))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); + testing::ValuesIn(GetConvSmokeTestCases()))); // Full tests INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); + testing::ValuesIn(GetConvFullTestCases()))); INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); + testing::ValuesIn(GetConvFullTestCases()))); INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P(Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); + testing::ValuesIn(GetConvFullTestCases()))); // Device applicability tests INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverImplicitGemm3DGroupBwdXdlopsDevApplicability_FP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetDevApplicabilityConvCase()))); diff --git a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupFwdXdlops.cpp b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupFwdXdlops.cpp old mode 100755 new mode 100644 index 261fd2ed09f9..ea9d66dd78a5 --- a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupFwdXdlops.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupFwdXdlops.cpp @@ -6,32 +6,37 @@ namespace { // numeric part of test case -using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestDataType = miopen::unit_tests::TestDataType; +template auto GetConvSmokeTestCases() { + const bool tf32_compute = type == TestDataType::TF32; std::vector test_cases = { // clang-format off - TestCase{{64, 32, 28, 28, 28}, {32, 32, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 1} + TestCase{{64, 32, 28, 28, 28}, {32, 32, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 1, false, tf32_compute} // clang-format on }; return test_cases; } +template auto GetConvFullTestCases() { + const bool tf32_compute = type == TestDataType::TF32; std::vector test_cases = { // clang-format off // Group Count 1 - TestCase{{6, 448, 3, 118, 182}, {896, 448, 1, 1, 1}, {0, 0, 0}, {1, 2, 2}, {1, 1, 1}, 1}, - TestCase{{128, 3, 2, 14, 14}, {320, 3, 2, 14, 14}, {0, 0, 0}, {2, 14, 14}, {1, 1, 1}, 1}, + TestCase{{6, 448, 3, 118, 182}, {896, 448, 1, 1, 1}, {0, 0, 0}, {1, 2, 2}, {1, 1, 1}, 1, false, tf32_compute}, + TestCase{{128, 3, 2, 14, 14}, {320, 3, 2, 14, 14}, {0, 0, 0}, {2, 14, 14}, {1, 1, 1}, 1, false, tf32_compute}, // Group Count > 1 (2, 3, 5, 16) - TestCase{{128, 32, 28, 28, 28}, {32, 16, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 2}, - TestCase{{48, 48, 28, 28, 28}, {48, 16, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 3}, - TestCase{{120, 60, 28, 28, 28}, {60, 12, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 5}, - TestCase{{64, 32, 28, 28, 28}, {32, 2, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 16}, + TestCase{{128, 32, 28, 28, 28}, {32, 16, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 2, false, tf32_compute}, + TestCase{{48, 48, 28, 28, 28}, {48, 16, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 3, false, tf32_compute}, + TestCase{{120, 60, 28, 28, 28}, {60, 12, 3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 5, false, tf32_compute}, + TestCase{{64, 32, 28, 28, 28}, {32, 2, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 16, false, tf32_compute}, // clang-format on }; @@ -42,7 +47,7 @@ auto GetDevApplicabilityConvCase() { // For device applicability checks return GetConvTestForGroupXdlops(miopenTensorNDHWC, - std::move(GetConvSmokeTestCases()[0])); + std::move(GetConvSmokeTestCases()[0])); } // Deterministic test case (for CPU deterministic applicability test) @@ -57,17 +62,25 @@ auto GetDeterministicConvCase() return GetConvTestForGroupXdlops(miopenTensorNDHWC, std::move(test_case)); } -template +template const auto& GetTestParams() { static const auto params = [] { // If MIOpen is built without CK these tests will fail, skip them to avoid failing #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; - - if constexpr(datatype != miopenFloat) + Gpu supportedDevices; + if constexpr(type == TestDataType::FP32) + { + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; + } + else if constexpr(type == TestDataType::TF32) + { + supportedDevices = Gpu::gfx94X | Gpu::gfx950; + } + else { - supportedDevices = supportedDevices | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950 | + Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; } #else Gpu supportedDevices = Gpu::None; @@ -96,6 +109,10 @@ using GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_BFP16 = using GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP32 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; +using GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_TF32 = + miopen::unit_tests::UnitTestConvSolverGroupXDlops; + using CPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlopsDevApplicability_FP16 = CPU_UnitTestConvSolverDevApplicabilityFwd_NONE; using CPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlopsDeterministicApplicability_NONE = @@ -117,6 +134,11 @@ TEST_P(GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP32, ConvHipImplicitG this->RunTest(miopen::solver::conv::ConvHipImplicitGemm3DGroupFwdXdlops{}); }; +TEST_P(GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_TF32, ConvHipImplicitGemm3DGroupFwdXdlops) +{ + this->RunTest(miopen::solver::conv::ConvHipImplicitGemm3DGroupFwdXdlops{}); +}; + TEST_P(CPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlopsDevApplicability_FP16, ConvHipImplicitGemm3DGroupFwdXdlops) { @@ -130,47 +152,67 @@ TEST_P(CPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlopsDeterministicApplicabil }; // Smoke tests -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); // Full tests -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); // Device applicability tests INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverImplicitGemm3DGroupFwdXdlopsDevApplicability_FP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetDevApplicabilityConvCase()))); INSTANTIATE_TEST_SUITE_P( diff --git a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupWrwXdlops.cpp b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupWrwXdlops.cpp index 00bf997b0c69..619fef62da2e 100644 --- a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupWrwXdlops.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemm3DGroupWrwXdlops.cpp @@ -6,32 +6,37 @@ namespace { // numeric part of test case -using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestDataType = miopen::unit_tests::TestDataType; +template auto GetConvSmokeTestCases() { + const bool tf32_compute = type == TestDataType::TF32; std::vector test_cases = { // clang-format off - TestCase {{1, 4, 8, 28, 28}, {4, 4, 3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}, 1} + TestCase {{1, 4, 8, 28, 28}, {4, 4, 3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}, 1, false, tf32_compute} // clang-format on }; return test_cases; } +template auto GetConvFullTestCases() { + const bool tf32_compute = type == TestDataType::TF32; std::vector test_cases = { // clang-format off // Group Count = 1 - TestCase {{1, 1, 8, 8, 8}, {1, 1, 2, 2, 2}, {0, 0, 0}, {2, 2, 2}, {1, 1, 1}, 1}, - TestCase {{6, 448, 3, 118, 182}, {896, 448, 1, 1, 1}, {0, 0, 0}, {1, 2, 2}, {1, 1, 1}, 1}, + TestCase {{1, 1, 8, 8, 8}, {1, 1, 2, 2, 2}, {0, 0, 0}, {2, 2, 2}, {1, 1, 1}, 1, false, tf32_compute}, + TestCase {{6, 448, 3, 118, 182}, {896, 448, 1, 1, 1}, {0, 0, 0}, {1, 2, 2}, {1, 1, 1}, 1, false, tf32_compute}, // Group Count > 1 (2, 3, 4) - TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, 2}, - TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 2}, - TestCase {{256, 9, 2, 14, 14}, {27, 3, 2, 14, 14}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 3}, - TestCase {{128, 4, 28, 28, 28}, {8, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 4} + TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, 2, false, tf32_compute}, + TestCase {{128, 2, 28, 28, 28}, {2, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 2, false, tf32_compute}, + TestCase {{256, 9, 2, 14, 14}, {27, 3, 2, 14, 14}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 3, false, tf32_compute}, + TestCase {{128, 4, 28, 28, 28}, {8, 1, 3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 4, false, tf32_compute} // clang-format on }; @@ -42,7 +47,7 @@ auto GetDevApplicabilityConvCase() { // For device applicability checks return GetConvTestForGroupXdlops(miopenTensorNDHWC, - std::move(GetConvSmokeTestCases()[0])); + std::move(GetConvSmokeTestCases()[0])); } // Deterministic test case (for CPU deterministic applicability test) @@ -57,20 +62,25 @@ auto GetDeterministicConvCase() return GetConvTestForGroupXdlops(miopenTensorNDHWC, std::move(test_case)); } -template +template const auto& GetTestParams() { static const auto params = [] { // If MIOpen is built without CK these tests will fail, skip them to avoid failing #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; - - if(datatype == miopenBFloat16) + Gpu supportedDevices; + if constexpr(type == TestDataType::FP32) + { + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; + } + else if constexpr(type == TestDataType::TF32 || type == TestDataType::BF16) + { supportedDevices = Gpu::gfx94X | Gpu::gfx950; - - if constexpr(datatype != miopenFloat) + } + else { - supportedDevices = supportedDevices | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950 | + Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; } #else Gpu supportedDevices = Gpu::None; @@ -103,6 +113,10 @@ using GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_BFP16 = using GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP32 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; +using GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_TF32 = + miopen::unit_tests::UnitTestConvSolverGroupXDlops; + using CPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlopsDevApplicability_FP16 = CPU_UnitTestConvSolverDevApplicabilityWrw_NONE; using CPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlopsDeterministicApplicability_NONE = @@ -124,6 +138,11 @@ TEST_P(GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP32, ConvHipImplicitG this->RunTest(miopen::solver::conv::ConvHipImplicitGemm3DGroupWrwXdlops{}); }; +TEST_P(GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_TF32, ConvHipImplicitGemm3DGroupWrwXdlops) +{ + this->RunTest(miopen::solver::conv::ConvHipImplicitGemm3DGroupWrwXdlops{}); +}; + TEST_P(CPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlopsDevApplicability_FP16, ConvHipImplicitGemm3DGroupWrwXdlops) { @@ -137,47 +156,67 @@ TEST_P(CPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlopsDeterministicApplicabil }; // Smoke tests -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_BFP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP32, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvSmokeTestCases()))); // Full tests -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_BFP16, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP32, - testing::Combine(testing::Values(GetTestParams()), - testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNDHWC, miopenTensorNCDHW), + testing::ValuesIn(GetConvFullTestCases()))); // Device applicability tests INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverImplicitGemm3DGroupWrwXdlopsDevApplicability_FP16, - testing::Combine(testing::Values(GetTestParams()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetDevApplicabilityConvCase()))); INSTANTIATE_TEST_SUITE_P( diff --git a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupBwdXdlops.cpp b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupBwdXdlops.cpp index f39f13c0e1d0..2e8cbfbc7632 100644 --- a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupBwdXdlops.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupBwdXdlops.cpp @@ -5,31 +5,38 @@ namespace { -using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestDataType = miopen::unit_tests::TestDataType; // Non-deterministic test cases (for GPU smoke tests) +template auto GetConvSmokeTestCases() { + const bool tf32_compute = type == TestDataType::TF32; + static std::vector test_cases = { // clang-format off - TestCase{{1, 32, 8, 8}, {32, 32, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1} + TestCase{{1, 32, 8, 8}, {32, 32, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1, false, tf32_compute} // clang-format on }; return test_cases; } +template auto GetConvFullTestCases() { + const bool tf32_compute = type == TestDataType::TF32; + static std::vector test_cases = { // clang-format off - TestCase{{1, 32, 8, 8}, {32, 32, 3, 3}, {1, 1}, {1, 1}, {1, 1}, 1}, // non-zero padding - TestCase{{1, 64, 24, 48}, {96, 64, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 1}, // stride > 1 - TestCase{{1, 32, 8, 8}, {32, 32, 3, 3}, {0, 0}, {1, 1}, {3, 3}, 1}, // dilation > 1 - TestCase{{1, 64, 24, 48}, {96, 64, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1}, + TestCase{{1, 32, 8, 8}, {32, 32, 3, 3}, {1, 1}, {1, 1}, {1, 1}, 1, false, tf32_compute}, // non-zero padding + TestCase{{1, 64, 24, 48}, {96, 64, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 1, false, tf32_compute}, // stride > 1 + TestCase{{1, 32, 8, 8}, {32, 32, 3, 3}, {0, 0}, {1, 1}, {3, 3}, 1, false, tf32_compute}, // dilation > 1 + TestCase{{1, 64, 24, 48}, {96, 64, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1, false, tf32_compute}, // Group count = 2 and 4 - TestCase{{1, 32, 8, 8}, {32, 16, 3, 3}, {0, 0}, {1, 1}, {3, 3}, 2}, // dilation > 1 - TestCase{{1, 64, 24, 48}, {96, 16, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 4}, + TestCase{{1, 32, 8, 8}, {32, 16, 3, 3}, {0, 0}, {1, 1}, {3, 3}, 2, false, tf32_compute}, // dilation > 1 + TestCase{{1, 64, 24, 48}, {96, 16, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 4, false, tf32_compute}, // clang-format on }; @@ -39,8 +46,8 @@ auto GetConvFullTestCases() auto GetDevApplicabilityConvCase() { // For device applicability checks - return GetConvTestForGroupXdlops(miopenTensorNHWC, - std::move(GetConvSmokeTestCases()[0])); + return GetConvTestForGroupXdlops( + miopenTensorNHWC, std::move(GetConvSmokeTestCases()[0])); } // Deterministic test case (for CPU deterministic applicability test) @@ -55,17 +62,25 @@ auto GetDeterministicConvCase() return GetConvTestForGroupXdlops(miopenTensorNHWC, std::move(test_case)); } -template +template const auto& GetTestParams() { static const auto params = [] { // If MIOpen is built without CK these tests will fail, skip them to avoid failing #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; - - if constexpr(datatype != miopenFloat) + Gpu supportedDevices; + if constexpr(type == TestDataType::FP32) + { + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; + } + else if constexpr(type == TestDataType::TF32) { - supportedDevices = supportedDevices | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; + supportedDevices = Gpu::gfx94X | Gpu::gfx950; + } + else + { + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950 | + Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; } #else Gpu supportedDevices = Gpu::None; @@ -77,10 +92,6 @@ const auto& GetTestParams() return params; } -const auto& GetTestParamsFP16() { return GetTestParams(); } -const auto& GetTestParamsBFP16() { return GetTestParams(); } -const auto& GetTestParamsFP32() { return GetTestParams(); } - } // namespace using GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP16 = @@ -92,6 +103,9 @@ using GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_BFP16 = using GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP32 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; +using GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_TF32 = + miopen::unit_tests::UnitTestConvSolverGroupXDlops; using CPU_UnitTestConvSolverImplicitGemmGroupBwdXdlopsDevApplicability_FP16 = CPU_UnitTestConvSolverDevApplicabilityBwd_NONE; using CPU_UnitTestConvSolverImplicitGemmGroupBwdXdlopsDeterministicApplicability_NONE = @@ -112,6 +126,11 @@ TEST_P(GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP32, ConvHipImplicitGem this->RunTest(miopen::solver::conv::ConvHipImplicitGemmGroupBwdXdlops{}); }; +TEST_P(GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_TF32, ConvHipImplicitGemmGroupBwdXdlops) +{ + this->RunTest(miopen::solver::conv::ConvHipImplicitGemmGroupBwdXdlops{}); +}; + TEST_P(CPU_UnitTestConvSolverImplicitGemmGroupBwdXdlopsDevApplicability_FP16, ConvHipImplicitGemmGroupBwdXdlops) { @@ -125,47 +144,67 @@ TEST_P(CPU_UnitTestConvSolverImplicitGemmGroupBwdXdlopsDeterministicApplicabilit }; // Smoke tests -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParamsBFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParamsFP32()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); // Full tests -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParamsBFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParamsFP32()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupBwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); // Device applicability tests INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverImplicitGemmGroupBwdXdlopsDevApplicability_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetDevApplicabilityConvCase()))); INSTANTIATE_TEST_SUITE_P( diff --git a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupFwdXdlops.cpp b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupFwdXdlops.cpp index dd44c4a1e590..6c7b126903c7 100644 --- a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupFwdXdlops.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupFwdXdlops.cpp @@ -6,30 +6,37 @@ namespace { // numeric part of test case -using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestDataType = miopen::unit_tests::TestDataType; +template auto GetConvSmokeTestCases() { - std::vector test_cases = { + const bool tf32_compute = type == TestDataType::TF32; + + static std::vector test_cases = { // clang-format off - TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1} + TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1, false, tf32_compute}, // clang-format on }; return test_cases; } +template auto GetConvFullTestCases() { - std::vector test_cases = { + const bool tf32_compute = type == TestDataType::TF32; + + static std::vector test_cases = { // clang-format off - TestCase{{1, 32, 8, 8}, {48, 32, 1, 1}, {1, 1}, {1, 1}, {1, 1}, 1}, // non-zero padding - TestCase{{1, 32, 8, 8}, {48, 32, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 1}, // stride > 1 - TestCase{{1, 32, 8, 8}, {48, 32, 1, 1}, {0, 0}, {1, 1}, {2, 2}, 1}, // dilation > 1 - TestCase{{1, 32, 24, 48}, {192, 32, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1}, + TestCase{{1, 32, 8, 8}, {48, 32, 1, 1}, {1, 1}, {1, 1}, {1, 1}, 1, false, tf32_compute}, // non-zero padding + TestCase{{1, 32, 8, 8}, {48, 32, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 1, false, tf32_compute}, // stride > 1 + TestCase{{1, 32, 8, 8}, {48, 32, 1, 1}, {0, 0}, {1, 1}, {2, 2}, 1, false, tf32_compute}, // dilation > 1 + TestCase{{1, 32, 24, 48}, {192, 32, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1, false, tf32_compute}, // Group count = 2 and 4 - TestCase{{1, 32, 8, 8}, {48, 16, 1, 1}, {0, 0}, {1, 1}, {2, 2}, 2}, // dilation > 1 - TestCase{{1, 32, 24, 48}, {96, 8, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 4}, + TestCase{{1, 32, 8, 8}, {48, 16, 1, 1}, {0, 0}, {1, 1}, {2, 2}, 2, false, tf32_compute}, // dilation > 1 + TestCase{{1, 32, 24, 48}, {96, 8, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 4, false, tf32_compute}, // clang-format on }; @@ -39,8 +46,8 @@ auto GetConvFullTestCases() auto GetDevApplicabilityConvCase() { // For device applicability checks - return GetConvTestForGroupXdlops(miopenTensorNHWC, - std::move(GetConvSmokeTestCases()[0])); + return GetConvTestForGroupXdlops( + miopenTensorNHWC, std::move(GetConvSmokeTestCases()[0])); } // Deterministic test case (for CPU deterministic applicability test) @@ -55,16 +62,25 @@ auto GetDeterministicConvCase() return GetConvTestForGroupXdlops(miopenTensorNHWC, std::move(test_case)); } -template +template const auto& GetTestParams() { static const auto params = [] { // If MIOpen is built without CK these tests will fail, skip them to avoid failing #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; - if constexpr(datatype != miopenFloat) + Gpu supportedDevices; + if constexpr(type == TestDataType::FP32) + { + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; + } + else if constexpr(type == TestDataType::TF32) + { + supportedDevices = Gpu::gfx94X | Gpu::gfx950; + } + else { - supportedDevices = supportedDevices | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950 | + Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; } #else Gpu supportedDevices = Gpu::None; @@ -76,11 +92,6 @@ const auto& GetTestParams() return params; } -const auto& GetTestParamsI8() { return GetTestParams(); } -const auto& GetTestParamsFP16() { return GetTestParams(); } -const auto& GetTestParamsBFP16() { return GetTestParams(); } -const auto& GetTestParamsFP32() { return GetTestParams(); } - } // namespace using GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_I8 = @@ -93,6 +104,9 @@ using GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_BFP16 = using GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP32 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; +using GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_TF32 = + miopen::unit_tests::UnitTestConvSolverGroupXDlops; using CPU_UnitTestConvSolverImplicitGemmGroupFwdXdlopsDevApplicability_FP16 = CPU_UnitTestConvSolverDevApplicabilityFwd_NONE; using CPU_UnitTestConvSolverImplicitGemmGroupFwdXdlopsDeterministicApplicability_NONE = @@ -118,6 +132,11 @@ TEST_P(GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP32, ConvHipImplicitGem this->RunTest(miopen::solver::conv::ConvHipImplicitGemmGroupFwdXdlops{}); }; +TEST_P(GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_TF32, ConvHipImplicitGemmGroupFwdXdlops) +{ + this->RunTest(miopen::solver::conv::ConvHipImplicitGemmGroupFwdXdlops{}); +}; + TEST_P(CPU_UnitTestConvSolverImplicitGemmGroupFwdXdlopsDevApplicability_FP16, ConvHipImplicitGemmGroupFwdXdlops) { @@ -131,59 +150,81 @@ TEST_P(CPU_UnitTestConvSolverImplicitGemmGroupFwdXdlopsDeterministicApplicabilit }; // Smoke tests -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_I8, - testing::Combine(testing::Values(GetTestParamsI8()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_I8, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParamsBFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParamsFP32()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); // Full tests -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_I8, - testing::Combine(testing::Values(GetTestParamsI8()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_BFP16, - testing::Combine(testing::Values(GetTestParamsBFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); - -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP32, - testing::Combine(testing::Values(GetTestParamsFP32()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_I8, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupFwdXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); // Device applicability tests INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverImplicitGemmGroupFwdXdlopsDevApplicability_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetDevApplicabilityConvCase()))); INSTANTIATE_TEST_SUITE_P( diff --git a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupWrwXdlops.cpp b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupWrwXdlops.cpp index a9b3d9472f73..e2ed9d6b680c 100644 --- a/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupWrwXdlops.cpp +++ b/projects/miopen/test/gtest/unit_conv_solver_ConvHipImplicitGemmGroupWrwXdlops.cpp @@ -6,30 +6,37 @@ namespace { // numeric part of test case -using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestCase = miopen::unit_tests::GroupXdlopsNumericData; +using TestDataType = miopen::unit_tests::TestDataType; +template auto GetConvSmokeTestCases() { + const bool tf32_compute = type == TestDataType::TF32; + static std::vector test_cases = { // clang-format off - TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1} + TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {0, 0}, {1, 1}, {1, 1}, 1, false, tf32_compute} // clang-format on }; return test_cases; } +template auto GetConvFullTestCases() { + const bool tf32_compute = type == TestDataType::TF32; + static std::vector test_cases = { // clang-format off - TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {1, 1}, {1, 1}, {1, 1}, 1}, // non-zero padding - TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 1}, // stride > 1 + TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {1, 1}, {1, 1}, {1, 1}, 1, false, tf32_compute}, // non-zero padding + TestCase{{1, 64, 8, 8}, {96, 64, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 1, false, tf32_compute}, // stride > 1 // Group count = 2 and 4 - TestCase{{1, 64, 8, 8}, {96, 32, 1, 1}, {0, 0}, {1, 1}, {2, 2}, 2}, // dilation > 1 - TestCase{{1, 64, 8, 8}, {96, 16, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 4}, // stride > 1 - // clang-format on + TestCase{{1, 64, 8, 8}, {96, 32, 1, 1}, {0, 0}, {1, 1}, {2, 2}, 2, false, tf32_compute}, // dilation > 1 + TestCase{{1, 64, 8, 8}, {96, 16, 1, 1}, {0, 0}, {2, 2}, {1, 1}, 4, false, tf32_compute}, // stride > 1 + // clang-format on }; return test_cases; @@ -38,8 +45,8 @@ auto GetConvFullTestCases() auto GetDevApplicabilityConvCase() { // For device applicability checks - return GetConvTestForGroupXdlops(miopenTensorNHWC, - std::move(GetConvSmokeTestCases()[0])); + return GetConvTestForGroupXdlops( + miopenTensorNHWC, std::move(GetConvSmokeTestCases()[0])); } // Deterministic test case (for CPU deterministic applicability test) @@ -54,19 +61,25 @@ auto GetDeterministicConvCase() return GetConvTestForGroupXdlops(miopenTensorNHWC, std::move(test_case)); } -template +template const auto& GetTestParams() { static const auto params = [] { // If MIOpen is built without CK these tests will fail, skip them to avoid failing #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; - if(datatype == miopenBFloat16) + Gpu supportedDevices; + if constexpr(type == TestDataType::FP32) + { + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950; + } + else if constexpr(type == TestDataType::TF32 || type == TestDataType::BF16) + { supportedDevices = Gpu::gfx94X | Gpu::gfx950; - - if constexpr(datatype != miopenFloat) + } + else { - supportedDevices = supportedDevices | Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; + supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X | Gpu::gfx950 | + Gpu::gfx110X | Gpu::gfx115X | Gpu::gfx120X; } #else Gpu supportedDevices = Gpu::None; @@ -78,10 +91,6 @@ const auto& GetTestParams() return params; } -const auto& GetTestParamsFP16() { return GetTestParams(); } -const auto& GetTestParamsBFP16() { return GetTestParams(); } -const auto& GetTestParamsFP32() { return GetTestParams(); } - } // namespace // Solver itself supports I8 in isApplicable, but CK returns 0 compatible kernels @@ -89,15 +98,19 @@ const auto& GetTestParamsFP32() { return GetTestParams(); } using GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP16 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; -; + using GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_BFP16 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; -; + using GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP32 = miopen::unit_tests::UnitTestConvSolverGroupXDlops; -; + +using GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_TF32 = + miopen::unit_tests::UnitTestConvSolverGroupXDlops; + using CPU_UnitTestConvSolverImplicitGemmGroupWrwXdlopsDevApplicability_FP16 = CPU_UnitTestConvSolverDevApplicabilityWrw_NONE; using CPU_UnitTestConvSolverImplicitGemmGroupWrwXdlopsDeterministicApplicability_NONE = @@ -118,6 +131,11 @@ TEST_P(GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP32, ConvHipImplicitGem this->RunTest(miopen::solver::conv::ConvHipImplicitGemmGroupWrwXdlops{}); }; +TEST_P(GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_TF32, ConvHipImplicitGemmGroupWrwXdlops) +{ + this->RunTest(miopen::solver::conv::ConvHipImplicitGemmGroupWrwXdlops{}); +}; + TEST_P(CPU_UnitTestConvSolverImplicitGemmGroupWrwXdlopsDevApplicability_FP16, ConvHipImplicitGemmGroupWrwXdlops) { @@ -131,48 +149,67 @@ TEST_P(CPU_UnitTestConvSolverImplicitGemmGroupWrwXdlopsDeterministicApplicabilit }; // Smoke tests -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_BFP16, - testing::Combine(testing::Values(GetTestParamsBFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP32, - testing::Combine(testing::Values(GetTestParamsFP32()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvSmokeTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); + +INSTANTIATE_TEST_SUITE_P( + Smoke, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvSmokeTestCases()))); // Full tests -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_BFP16, - testing::Combine(testing::Values(GetTestParamsBFP16()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); -INSTANTIATE_TEST_SUITE_P(Full, - GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP32, - testing::Combine(testing::Values(GetTestParamsFP32()), - testing::Values(miopenTensorNHWC, miopenTensorNCHW), - testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_FP32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); +INSTANTIATE_TEST_SUITE_P( + Full, + GPU_UnitTestConvSolverImplicitGemmGroupWrwXdlops_TF32, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenTensorNHWC, miopenTensorNCHW), + testing::ValuesIn(GetConvFullTestCases()))); // Device applicability tests INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverImplicitGemmGroupWrwXdlopsDevApplicability_FP16, - testing::Combine(testing::Values(GetTestParamsFP16()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetDevApplicabilityConvCase()))); INSTANTIATE_TEST_SUITE_P( diff --git a/projects/miopen/test/gtest/unit_conv_solver_group_xdlops.hpp b/projects/miopen/test/gtest/unit_conv_solver_group_xdlops.hpp index 543f098c114c..b715914b1d07 100644 --- a/projects/miopen/test/gtest/unit_conv_solver_group_xdlops.hpp +++ b/projects/miopen/test/gtest/unit_conv_solver_group_xdlops.hpp @@ -20,6 +20,7 @@ struct GroupXdlopsNumericData unsigned int group_count; bool deterministic = false; + bool tf32_compute = false; friend std::ostream& operator<<(std::ostream& os, const GroupXdlopsNumericData& numeric_data) { @@ -31,13 +32,15 @@ struct GroupXdlopsNumericData LogVector(os, "dilation", numeric_data.dilation); os << "group_count: " << numeric_data.group_count; os << "deterministic: " << numeric_data.deterministic; + os << "tf32_compute: " << numeric_data.tf32_compute; os << "]"; return os; } private: template - static void LogVector(std::ostream& os, const std::string& vec_name, const std::vector& vec) + inline static void + LogVector(std::ostream& os, const std::string& vec_name, const std::vector& vec) { os << vec_name << ": ["; for(size_t i = 0; i < vec.size(); ++i) @@ -61,7 +64,8 @@ ConvTestCase GetConvTestForGroupXdlops(miopenTensorLayout_t layout, std::move(conv_numeric_data.stride), std::move(conv_numeric_data.dilation), std::move(conv_numeric_data.group_count), - conv_numeric_data.deterministic}}; + conv_numeric_data.deterministic, + conv_numeric_data.tf32_compute}}; return conv_test_case; } diff --git a/projects/miopen/test/utils/gtest_formating_checks.py b/projects/miopen/test/utils/gtest_formating_checks.py index 2a3892ac8910..c87140c9b3c8 100644 --- a/projects/miopen/test/utils/gtest_formating_checks.py +++ b/projects/miopen/test/utils/gtest_formating_checks.py @@ -46,7 +46,7 @@ # Valid enums and Regex for validation VALID_HW_TYPES = {"CPU", "GPU"} -VALID_DATATYPES = {"FP8", "FP16", "FP32", "FP64", "BFP16", "BFP8", "I64", "I32", "I16", "I8", "NONE"} +VALID_DATATYPES = {"FP8", "FP16", "FP32", "TF32", "FP64", "BFP16", "BFP8", "I64", "I32", "I16", "I8", "NONE"} # Our suite (or fixture) naming convention: must start with CPU or GPU, followed by one or more alphanum groups, and end with a valid datatype. TESTSUITE_REGEX = re.compile( r"^(CPU|GPU)_[A-Za-z0-9]+(?:_[A-Za-z0-9]+)*_(" + "|".join(VALID_DATATYPES) + r")$"