ROCm · yingluAMD · Jan 19, 2026 · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025
@@ -156,6 +156,10 @@ pipeline {
             name: "DATATYPE_FP32",
             defaultValue: true,
             description: "")
+        booleanParam(
+            name: "DATATYPE_TF32",
+            defaultValue: true,
+            description: "")
         booleanParam(
             name: "DATATYPE_FP16",
             defaultValue: true,
@@ -199,6 +203,7 @@ pipeline {
         Bf16_flags      = " -DMIOPEN_TEST_BFLOAT16=On"
         Int8_flags      = " -DMIOPEN_TEST_INT8=On"
         Full_test       = " -DMIOPEN_TEST_ALL=On"
+        Tf32_flags      = " -DMIOPEN_TEST_TF32=On"
 
         gfx908_flags    = " -DMIOPEN_INSTALL_GPU_DATABASES=gfx908"
         gfx90a_flags    = " -DMIOPEN_INSTALL_GPU_DATABASES=gfx90a"
@@ -499,6 +504,24 @@ pipeline {
                         }
                     }
                 }
+                stage('TF32 Hip Install All gfx942') {
+                    when {
+                        beforeAgent true
+                        expression { params.TARGET_GFX942 && params.DATATYPE_TF32 }
+                    }
+                    options {
+                        retry(2)
+                    }
+                    agent{ label rocmnode("gfx942") }
+                    steps{
+                        runBuildAndSingleGtestJob(Full_test + Tf32_flags + gfx942_flags, Build_timeout_minutes)
+                    }
+                    post {
+                        always {
+                            cleanWs()
+                        }
+                    }
+                }
                 // GFX1101 Tests
                 stage('Fp16 Hip Install All gfx1101') {
                     when {

@@ -452,6 +452,13 @@ class ConvDriver : public Driver
         constexpr bool is_bfp8 = std::is_same<Tgpu, bfloat8_fnuz>::value;
         if(is_bfp8 || is_fp8 || TensorsCasted())
             tolerance *= 37.0;
+
+        { // tf32 has same mantissa length as fp16
+            auto math_type_ = inflags.GetValueInt("math_type");
+            if(std::is_same_v<Tgpu, float> &&
+               (miopen::EnvEnableTF32() || (math_type_ == miopenMathDefault)))
+                tolerance = 8.2e-3;
+        }
         return tolerance;
     }
 
@@ -864,6 +871,8 @@ int ConvDriver<Tgpu, Tref>::GetandSetData()
             warmupConvDesc,
             static_cast<int>(miopenConvolutionFindModeNormal)); // Repeat via hidden API.
         miopenSetConvolutionGroupCount(warmupConvDesc, group_count);
+        miopenSetConvolutionAttribute(
+            warmupConvDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, inflags.GetValueInt("math_type"));
 
         int warmup_out_len_size = miopen::deref(warmupInputTensor).GetNumDims();
         std::vector<int> warmup_out_len(warmup_out_len_size);
@@ -1018,6 +1027,8 @@ int ConvDriver<Tgpu, Tref>::AddCmdLineArgs()
                          "0",
                          "MIOpen tuning policy (Default=0, or no tuning policy set)",
                          "int");
+    // TODO:(LYM) change back to 0 when TF32 is fully supported
+    inflags.AddInputFlag("math_type", 'M', "1", "math type of compute (Default=1)", "int");
 
     return 0;
 }
@@ -1222,6 +1233,14 @@ int ConvDriver<Tgpu, Tref>::SetConvDescriptorFromCmdLineArgs()
         miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data());
     }
 
+    auto math_type_ = inflags.GetValueInt("math_type");
+    if(math_type_ < miopenMathDefault || math_type_ > miopenMathPedantic)
+    {
+        std::cout << "Invalid math_type value: " << math_type_ << std::endl;
+        exit(0); // NOLINT (concurrency-mt-unsafe)
+    }
+    miopenSetConvolutionAttribute(convDesc, MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE, math_type_);
+
     return miopenStatusSuccess;
 }
 

@@ -120,6 +120,14 @@ typedef enum
     miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. */
 } miopenStatus_t;
 
+typedef enum
+{
+    // TODO:(LYM) temporary use Pedantic as default until TF32 is fully supported
+    miopenMathDefault = 0, /*!< Use TF32 if possible */
+    miopenMathPedantic =
+        1, /*!< Default MathType. Strict IEEE compliance. Don't allow datatype down conversion. */
+} miopenMathType_t;
+
 #ifdef MIOPEN_BETA_API
 typedef enum
 {
@@ -639,6 +647,9 @@ typedef enum
 #else
 // miopenReserved1 = 2,
 #endif
+    // TODO:(LYM) temporarily use Pedantic as default until TF32 is fully supported
+    MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE =
+        3, /*!< refer to miopenMathType_t,default is miopenMathPedantic >*/
 } miopenConvolutionAttrib_t;
 
 /*! @ingroup convolutions

@@ -936,6 +936,9 @@ if(MIOPEN_USE_COMPOSABLEKERNEL)
         # Use the aliased targets when we pull CK from /opt/rocm or other place on disc.
         set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_conv_operations hip::host)
     endif()
+    if(GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950")
+        target_compile_definitions(MIOpen PRIVATE CK_ENABLE_TF32)
+    endif()
 endif()
 
 if(WIN32)

@@ -31,6 +31,7 @@
 #include <miopen/datatype.hpp>
 #include <miopen/execution_context.hpp>
 #include <miopen/tensor_layout.hpp>
+#include <miopen/solver/static_ck_common.hpp>
 
 #include <sstream>
 
@@ -215,9 +216,14 @@ void ProblemDescription::MakeNetworkConfig(std::string& conf_key) const
         ss << 'x' << GetWeightsLayout();
         ss << 'x' << GetOutLayout();
     }
-    ss << 'x' << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    const auto data_type =
+        EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    ss << 'x' << data_type;
 
     std::ostringstream optional;
+    if(data_type == "FP32" && UseTF32())
+        optional << "TF32" << 'x';
+
     if(const auto ct = GetInCastType())
         optional << "ci" << GetDataTypeName(*ct);
     if(const auto ct = GetWeightsCastType())
@@ -271,10 +277,12 @@ void ProblemDescription::Serialize(std::ostream& stream) const
         stream << sep << GetWeightsLayout();
         stream << sep << GetOutLayout();
     }
-    stream << sep << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    // clang-format on
+    const auto data_type =
+        EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+    stream << sep << data_type;
     stream << sep << GetDirectionStr();
 
-    // clang-format on
     // New performance config entries shall come into variable/optional part of db key.
     // This is to support backward compatibility with previous versions of databases.
     std::ostringstream optional;
@@ -290,6 +298,10 @@ void ProblemDescription::Serialize(std::ostream& stream) const
         if(const auto ct = GetOutCastType())
             optional << "_co" << GetDataTypeName(*ct);
 
+        // cx indicates compute datatype
+        if(data_type == "FP32" && UseTF32())
+            optional << "_cxTF32";
+
         SerializeStrides(optional, in, out, weights, sep);
     }
     if(!optional.str().empty())
@@ -350,6 +362,15 @@ void ProblemDescription::SetupFloats(ExecutionContext& ctx) const
                  << "x" << GetDataTypeName(GetOutDataType()));
 }
 
+void ProblemDescription::SetupComputeType(const ExecutionContext& ctx) const
+{
+    if(miopen::solver::static_ck::IsTF32Supported(ctx.GetStream().GetDeviceName()) &&
+       conv.EnableTF32())
+    {
+        use_tf32 = true;
+    }
+}
+
 std::string ProblemDescription::ComputeLayout(const TensorDescriptor& td) const
 {
     return td.GetLayout_str();

@@ -330,7 +330,8 @@ std::vector<Solution> EvaluateInvokers(const Handle& handle,
 
             MIOPEN_THROW_IF(elapsed <= 0, "Invalid elapsed time detected in EvaluateInvokers");
 
-            MIOPEN_LOG_I(sol << ": " << elapsed << (elapsed < best ? " < " : " >= ") << best);
+            MIOPEN_LOG_I("solution(current vs best):" << sol << ": " << elapsed
+                                                      << (elapsed < best ? " < " : " >= ") << best);
             if(elapsed < best)
             {
                 best                              = elapsed;

@@ -468,6 +468,16 @@ std::size_t ConvolutionDescriptor::GetWorkSpaceSize(ExecutionContext ctx,
     return workspace_size;
 }
 
+bool ConvolutionDescriptor::EnableTF32() const
+{
+    // TODO:(LYM) change back to && when TF32 is fully supported
+    if((miopen::EnvEnableTF32() ||
+        (static_cast<miopenMathType_t>(attribute.Get(MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)) ==
+         miopenMathDefault)))
+        return true;
+    return false;
+}
+
 std::ostream& operator<<(std::ostream& stream, const ConvolutionDescriptor& c)
 {
     stream << "conv" << c.spatialDim << "d, ";
@@ -540,6 +550,18 @@ void ConvolutionAttribute::Set(miopenConvolutionAttrib_t attr, int value)
         }
         fp8rounding_mode.rounding_mode = rounding_mode;
     }
+    else if(attr == MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)
+    {
+        const auto math_type_ = static_cast<miopenMathType_t>(value);
+        if(math_type_ != miopenMathDefault && math_type_ != miopenMathPedantic)
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "[Set conv attribute] Error: Attempt to set invalid value for "
+                         "MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE: " +
+                             std::to_string(value));
+        }
+        math_type.value = math_type_;
+    }
     else
     {
         MIOPEN_THROW(miopenStatusBadParm,
@@ -556,6 +578,8 @@ int ConvolutionAttribute::Get(miopenConvolutionAttrib_t attr) const
         return static_cast<int>(fp8rounding_mode.rounding_mode);
     else if(attr == MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC)
         return deterministic.value;
+    else if(attr == MIOPEN_CONVOLUTION_ATTRIB_MATH_TYPE)
+        return math_type.value;
     MIOPEN_THROW(miopenStatusBadParm,
                  "[Get conv attribute] Error: Attribute [" +
                      std::to_string(static_cast<int>(attr)) + "] does not exist.");

@@ -67,6 +67,7 @@ static inline auto MakeFwdCtxAndProblem(miopenHandle_t handle,
 
     auto ctx = ExecutionContext{&miopen::deref(handle)};
     problem.SetupFloats(ctx);
+    problem.SetupComputeType(ctx);
     return std::make_tuple(std::move(ctx), std::move(problem));
 }
 
@@ -86,6 +87,7 @@ static inline auto MakeBwdCtxAndProblem(miopenHandle_t handle,
 
     auto ctx = ExecutionContext{&miopen::deref(handle)};
     problem.SetupFloats(ctx);
+    problem.SetupComputeType(ctx);
     return std::make_tuple(std::move(ctx), std::move(problem));
 }
 
@@ -111,6 +113,7 @@ static inline auto MakeWrWCtxAndProblem(miopenHandle_t handle,
 
     auto ctx = ExecutionContext{&miopen::deref(handle)};
     problem.SetupFloats(ctx);
+    problem.SetupComputeType(ctx);
     return std::make_tuple(std::move(ctx), std::move(problem));
 }
 

@@ -295,6 +295,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
         return GetInCastType() || GetWeightsCastType() || GetOutCastType();
     }
 
+    bool UseTF32() const { return use_tf32; }
+
     // To be used in Solvers that do not implement ALT FP16 kernels.
     // Those Solvers must be non-applicable for gfx90a when this function returns true.
     bool IsGfx90aFp16altRequired() const
@@ -395,6 +397,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
             self.GetInDataType(), self.GetWeightsDataType(), self.GetOutDataType());
         f(data_type, "data_type");
         f(self.GetDirectionStr(), "direction");
+        if(data_type == "FP32" && self.UseTF32())
+            f("TF32", "compute_datatype");
     }
 
     template <class Self, class Visitor>
@@ -406,6 +410,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
     }
 
     void SetupFloats(ExecutionContext& ctx) const;
+    void SetupComputeType(const ExecutionContext& ctx) const;
 
 private:
     std::string ComputeLayout(const TensorDescriptor& td) const;
@@ -425,6 +430,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase
     Scalar alpha                          = Scalar(1.0);
     Scalar beta                           = Scalar(0.0);
     miopenAlphaBetaCase_t alpha_beta_case = DEFAULT;
+    mutable bool use_tf32                 = false;
 };
 
 } // namespace conv