ROCm · yingluAMD · Nov 16, 2025 · Sep 1, 2025 · Sep 12, 2025 · Sep 15, 2025
@@ -156,6 +156,10 @@ pipeline {
             name: "DATATYPE_FP32",
             defaultValue: true,
             description: "")
+        booleanParam(
+            name: "DATATYPE_TF32",
+            defaultValue: true,
+            description: "")
         booleanParam(
             name: "DATATYPE_FP16",
             defaultValue: true,
@@ -199,6 +203,7 @@ pipeline {
         Bf16_flags      = " -DMIOPEN_TEST_BFLOAT16=On"
         Int8_flags      = " -DMIOPEN_TEST_INT8=On"
         Full_test       = " -DMIOPEN_TEST_ALL=On"
+        Tf32_flags      = " -DMIOPEN_TEST_TF32=On"
 
         gfx908_flags    = " -DMIOPEN_INSTALL_GPU_DATABASES=gfx908"
         gfx90a_flags    = " -DMIOPEN_INSTALL_GPU_DATABASES=gfx90a"
@@ -526,6 +531,24 @@ pipeline {
                         runBuildAndSingleGtestJob(Full_test + gfx1101_flags, Build_timeout_minutes)
                     }
                 }
+                // Full test build (Full_test) with the TF32 tests switched on (Tf32_flags) for gfx942.
+                // Runs only when both the TARGET_GFX942 and DATATYPE_TF32 job parameters are set.
+                stage('TF32 Hip All gfx942') {
+                    when {
+                        // Evaluate the condition before an agent is allocated.
+                        beforeAgent true
+                        expression { params.TARGET_GFX942 && params.DATATYPE_TF32 }
+                    }
+                    options {
+                        retry(2)
+                    }
+                    agent{ label rocmnode("gfx942") }
+                    steps{
+                        runBuildAndSingleGtestJob(Full_test + Tf32_flags + gfx942_flags, Build_timeout_minutes)
+                    }
+                    post {
+                        always {
+                            // Clean the workspace whatever the stage result was.
+                            cleanWs()
+                        }
+                    }
+                }
             }
         }
         stage("Nightly Tests") {

@@ -456,6 +456,12 @@ class ConvDriver : public Driver
         constexpr bool is_bfp8 = std::is_same<Tgpu, bfloat8_fnuz>::value;
         if(is_bfp8 || is_fp8 || TensorsCasted())
             tolerance *= 37.0;
+
+        { // tf32 has same mantissa length as fp16
+            // TF32 is requested either through the environment override or by selecting
+            // miopenMathDefault (0). miopenMathPedantic (1) forbids the down conversion, so it
+            // must not relax the FP32 tolerance.
+            const auto math_type_ = inflags.GetValueInt("math_type");
+            if(std::is_same_v<Tgpu, float> &&
+               (miopen::EnvEnableTF32() || math_type_ == miopenMathDefault))
+                tolerance = 8.2e-3;
+        }
         return tolerance;
     }
 
@@ -868,6 +874,8 @@ int ConvDriver<Tgpu, Tref>::GetandSetData()
             warmupConvDesc,
             static_cast<int>(miopenConvolutionFindModeNormal)); // Repeat via hidden API.
         miopenSetConvolutionGroupCount(warmupConvDesc, group_count);
+        miopenSetConvolutionAttribute(
+            warmupConvDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, inflags.GetValueInt("math_type"));
 
         int warmup_out_len_size = miopen::deref(warmupInputTensor).GetNumDims();
         std::vector<int> warmup_out_len(warmup_out_len_size);
@@ -1022,6 +1030,8 @@ int ConvDriver<Tgpu, Tref>::AddCmdLineArgs()
                          "0",
                          "MIOpen tuning policy (Default=0, or no tuning policy set)",
                          "int");
+    // TODO:(LYM) change back to 0
+    inflags.AddInputFlag("math_type", 'M', "1", "math type of compute (Default=1)", "int");
 
     return 0;
 }
@@ -1226,6 +1236,14 @@ int ConvDriver<Tgpu, Tref>::SetConvDescriptorFromCmdLineArgs()
         miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data());
     }
 
+    // Validate the requested math type before it is stored in the convolution descriptor.
+    const auto math_type_ = inflags.GetValueInt("math_type");
+    if(math_type_ < miopenMathDefault || math_type_ > miopenMathPedantic)
+    {
+        std::cout << "Invalid math_type value: " << math_type_ << std::endl;
+        // An invalid argument is an error: terminate with a failing status so that scripts
+        // and CI do not mistake the run for a success.
+        exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe)
+    }
+    miopenSetConvolutionAttribute(convDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, math_type_);
+
     return miopenStatusSuccess;
 }
 

@@ -120,6 +120,14 @@ typedef enum
     miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. */
 } miopenStatus_t;
 
+/*! @brief Math type of a convolution, selected through MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE.
+ */
+typedef enum
+{
+    // TODO:(LYM) temporarily use Pedantic as default until TF32 is fully supported
+    miopenMathDefault = 0, /*!< Use TF32 if possible */
+    miopenMathPedantic =
+        1, /*!< Strict IEEE compliance, no datatype down conversion. Current default (see TODO). */
+} miopenMathType_t;
+
 #ifdef MIOPEN_BETA_API
 typedef enum
 {
@@ -639,6 +647,9 @@ typedef enum
 #else
 // miopenReserved1 = 2,
 #endif
+    // TODO:(LYM) temporarily use Pedantic as default until TF32 is fully supported
+    MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE =
+        3, /*!< Math type of the convolution, see miopenMathType_t. Default: miopenMathPedantic. */
 } miopenConvolutionAttrib_t;
 
 /*! @ingroup convolutions

@@ -186,9 +186,14 @@ void ProblemDescription::MakeNetworkConfig(std::string& conf_key) const
         ss << 'x' << GetWeightsLayout();
         ss << 'x' << GetOutLayout();
     }
-    ss << 'x' << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    const auto data_type =
+        EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    ss << 'x' << data_type;
 
     std::ostringstream optional;
+    if(data_type == "FP32" && UseTF32())
+        optional << "TF32" << 'x';
+
     if(const auto ct = GetInCastType())
         optional << "ci" << GetDataTypeName(*ct);
     if(const auto ct = GetWeightsCastType())
@@ -239,10 +244,12 @@ void ProblemDescription::Serialize(std::ostream& stream) const
         stream << sep << GetWeightsLayout();
         stream << sep << GetOutLayout();
     }
-    stream << sep << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    // clang-format on
+    const auto data_type =
+        EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    stream << sep << data_type;
     stream << sep << GetDirectionStr();
 
-    // clang-format on
     // New performance config entries shall come into variable/optional part of db key.
     // This is to support backward compatibility with previous versions of databases.
     std::ostringstream optional;
@@ -257,6 +264,10 @@ void ProblemDescription::Serialize(std::ostream& stream) const
             optional << "_cw" << GetDataTypeName(*ct);
         if(const auto ct = GetOutCastType())
             optional << "_co" << GetDataTypeName(*ct);
+
+        // cx indicates compute datatype
+        if(data_type == "FP32" && UseTF32())
+            optional << "_cxTF32";
     }
     if(!optional.str().empty())
     {
@@ -316,6 +327,14 @@ void ProblemDescription::SetupFloats(ExecutionContext& ctx) const
                  << "x" << GetDataTypeName(GetOutDataType()));
 }
 
+// Marks this problem as computing in TF32 when the device of the context's stream is gfx942
+// and the convolution descriptor enables TF32. In every other case use_tf32 is left untouched.
+void ProblemDescription::SetupComputeType(const ExecutionContext& ctx) const
+{
+    const bool is_gfx942 = ctx.GetStream().GetDeviceName() == "gfx942";
+    // The descriptor is only queried on gfx942, exactly as a short-circuited && would do.
+    if(!is_gfx942 || !conv.EnableTF32())
+        return;
+
+    use_tf32 = true;
+}
+
 std::string ProblemDescription::ComputeLayout(const TensorDescriptor& td) const
 {
     return td.GetLayout_str();

@@ -302,7 +302,8 @@ std::vector<Solution> EvaluateInvokers(const Handle& handle,
 
             MIOPEN_THROW_IF(elapsed <= 0, "Invalid elapsed time detected in EvaluateInvokers");
 
-            MIOPEN_LOG_I(sol << ": " << elapsed << (elapsed < best ? " < " : " >= ") << best);
+            MIOPEN_LOG_I("solution(current vs best):" << sol << ": " << elapsed
+                                                      << (elapsed < best ? " < " : " >= ") << best);
             if(elapsed < best)
             {
                 best         = elapsed;

@@ -468,6 +468,18 @@ std::size_t ConvolutionDescriptor::GetWorkSpaceSize(ExecutionContext ctx,
     return workspace_size;
 }
 
+bool ConvolutionDescriptor::EnableTF32() const
+{
+    // Intended final rule: TF32 only when EnvEnableTF32() AND the math type attribute is
+    // miopenMathDefault. While TF32 is not yet fully validated with the database, the two
+    // conditions are OR-ed instead, so either one alone enables TF32.
+    // TODO:(LYM) change back to &&
+    return miopen::EnvEnableTF32() ||
+           static_cast<miopenMathType_t>(attribute.Get(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)) ==
+               miopenMathDefault;
+}
+
 std::ostream& operator<<(std::ostream& stream, const ConvolutionDescriptor& c)
 {
     stream << "conv" << c.spatialDim << "d, ";
@@ -540,6 +552,18 @@ void ConvolutionAttribute::Set(miopenConvolutionAttrib_t attr, int value)
         }
         fp8rounding_mode.rounding_mode = rounding_mode;
     }
+    else if(attr == MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)
+    {
+        // Check the raw integer before converting it: casting an out-of-range value to the
+        // unscoped enum miopenMathType_t is undefined behavior, so a range check performed on
+        // the converted value cannot be relied upon.
+        if(value != static_cast<int>(miopenMathDefault) &&
+           value != static_cast<int>(miopenMathPedantic))
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "[Set conv attribute] Error: Attempt to set invalid value for "
+                         "MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE: " +
+                             std::to_string(value));
+        }
+        math_type.value = static_cast<miopenMathType_t>(value);
+    }
     else
     {
         MIOPEN_THROW(miopenStatusBadParm,
@@ -556,6 +580,8 @@ int ConvolutionAttribute::Get(miopenConvolutionAttrib_t attr) const
         return static_cast<int>(fp8rounding_mode.rounding_mode);
     else if(attr == MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC)
         return deterministic.value;
+    else if(attr == MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)
+        return math_type.value;
     MIOPEN_THROW(miopenStatusBadParm,
                  "[Get conv attribute] Error: Attribute [" +
                      std::to_string(static_cast<int>(attr)) + "] does not exist.");

@@ -67,6 +67,7 @@ static inline auto MakeFwdCtxAndProblem(miopenHandle_t handle,
 
     auto ctx = ExecutionContext{&miopen::deref(handle)};
     problem.SetupFloats(ctx);
+    problem.SetupComputeType(ctx);
     return std::make_tuple(std::move(ctx), std::move(problem));
 }
 
@@ -86,6 +87,7 @@ static inline auto MakeBwdCtxAndProblem(miopenHandle_t handle,
 
     auto ctx = ExecutionContext{&miopen::deref(handle)};
     problem.SetupFloats(ctx);
+    problem.SetupComputeType(ctx);
     return std::make_tuple(std::move(ctx), std::move(problem));
 }
 
@@ -111,6 +113,7 @@ static inline auto MakeWrWCtxAndProblem(miopenHandle_t handle,
 
     auto ctx = ExecutionContext{&miopen::deref(handle)};
     problem.SetupFloats(ctx);
+    problem.SetupComputeType(ctx);
     return std::make_tuple(std::move(ctx), std::move(problem));
 }
 

@@ -296,6 +296,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
         return GetInCastType() || GetWeightsCastType() || GetOutCastType();
     }
 
+    /// True once SetupComputeType() has selected TF32 compute for this problem.
+    bool UseTF32() const { return use_tf32; }
+
     // To be used in Solvers that do not implement ALT FP16 kernels.
     // Those Solvers must be non-applicable for gfx90a when this function returns true.
     bool IsGfx90aFp16altRequired() const
@@ -396,6 +398,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
             self.GetInDataType(), self.GetWeightsDataType(), self.GetOutDataType());
         f(data_type, "data_type");
         f(self.GetDirectionStr(), "direction");
+        if(data_type == "FP32" && self.UseTF32())
+            f("TF32", "compute_datatype");
     }
 
     template <class Self, class Visitor>
@@ -407,6 +411,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
     }
 
     void SetupFloats(ExecutionContext& ctx) const;
+    void SetupComputeType(const ExecutionContext& ctx) const;
 
 private:
     std::string ComputeLayout(const TensorDescriptor& td) const;
@@ -426,6 +431,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
     Scalar alpha                          = Scalar(1.0);
     Scalar beta                           = Scalar(0.0);
     miopenAlphaBetaCase_t alpha_beta_case = DEFAULT;
+    mutable bool use_tf32                 = false;
 };
 
 } // namespace conv

@@ -4524,13 +4524,15 @@ struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
     MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const;
     MIOPEN_INTERNALS_EXPORT bool
     operator==(const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const;
+    /// Returns the use_tf32 flag stored in this performance config.
+    bool UseTF32() const { return use_tf32; }
 
 private:
-    template <typename DataType>
-    void Init(const miopen::conv::ProblemDescription&);
-    template <typename DataType>
+    template <typename DataType, typename ComputeType = DataType>
+    bool Init(const miopen::conv::ProblemDescription&);
+    template <typename DataType, typename ComputeType = DataType>
     bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const;
     void InitValidKernels(const miopen::conv::ProblemDescription& problem);
+    mutable bool use_tf32 = false;
 };
 
 struct ConvHipImplicitGemm3DGroupFwdXdlops final
@@ -4567,7 +4569,7 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final
     bool MayNeedWorkspace() const override { return true; }
 
 private:
-    template <typename DataType>
+    template <typename DataType, typename ComputeType = DataType>
     bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const;
 };
 

@@ -53,8 +53,15 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC)
 MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE)
 MIOPEN_DECLARE_ENV_VAR_UINT64(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED)
 
+// TF32 environment overrides. TF32 is temporarily disabled by default until the feature is
+// fully complete (the trailing 0 is presumably the default value of each variable).
+// No trailing semicolon, matching the other MIOPEN_DECLARE_ENV_VAR_* declarations in this file.
+// TODO:(LYM) change back
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TF32_OVERRIDE, 0)
+MIOPEN_DECLARE_ENV_VAR_BOOL(NVIDIA_TF32_OVERRIDE, 0)
+
 namespace miopen {
 
+MIOPEN_INTERNALS_EXPORT bool EnvEnableTF32();
+
 namespace conv {
 struct ProblemDescription;
 } // namespace conv
@@ -132,6 +139,17 @@ struct MIOPEN_INTERNALS_EXPORT ConvolutionAttribute
         }
     } deterministic;
 
+    /// Holds the value of the MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE attribute.
+    class MathType
+    {
+        friend struct ConvolutionAttribute;
+
+        // Pedantic is the temporary default while the TF32 feature is being completed.
+        // TODO:(LYM) change back
+        miopenMathType_t value{miopenMathPedantic};
+
+    public:
+        /// Current math type as a plain int.
+        int Get() const { return static_cast<int>(value); }
+    } math_type;
+
     /// Tri-state attribute values:
     /// * -1: Default (attribute-specific).
     /// * 0: Disabled/Yes.
@@ -350,6 +368,7 @@ struct MIOPEN_INTERNALS_EXPORT ConvolutionDescriptor : miopenConvolutionDescript
                                     Data_t dw,
                                     Data_t workSpace,
                                     std::size_t workSpaceSize) const;
+    miopenMathType_t GetMathType() const;
 
     std::size_t spatialDim;
     miopenConvolutionMode_t mode;
@@ -373,6 +392,8 @@ struct MIOPEN_INTERNALS_EXPORT ConvolutionDescriptor : miopenConvolutionDescript
     std::size_t GetSolutionCountFallback(const ExecutionContext& ctx,
                                          const conv::ProblemDescription& problem) const;
 
+    bool EnableTF32() const;
+
     friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv);
     friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv);
 

@@ -42,6 +42,7 @@ struct FusionContext : ExecutionContext
     {
         auto ctx = ExecutionContext{*this};
         conv_problem.SetupFloats(ctx);
+        conv_problem.SetupComputeType(ctx);
         return ctx;
     }
 };