From 3abdaf55233beaf8cc0b2f014f85ffff547432ac Mon Sep 17 00:00:00 2001
From: Torre Zuk <42548444+TorreZuk@users.noreply.github.com>
Date: Fri, 21 Nov 2025 14:57:10 -0700
Subject: [PATCH 1/2] revise syrk_ex tolerance

---
 .../clients/common/cblas_interface.cpp        | 58 ++++++++++++++-----
 .../include/blas_ex/testing_syrk_ex.hpp       |  4 +-
 projects/rocblas/clients/include/near.hpp     |  8 +++
 3 files changed, 52 insertions(+), 18 deletions(-)
diff --git a/projects/rocblas/clients/common/cblas_interface.cpp b/projects/rocblas/clients/common/cblas_interface.cpp
index 84e070d756a..5335cfd8753 100644
--- a/projects/rocblas/clients/common/cblas_interface.cpp
+++ b/projects/rocblas/clients/common/cblas_interface.cpp
@@ -2242,26 +2242,52 @@ void ref_syrk_ex(rocblas_fill      uplo,
                  U*                C,
                  int64_t           ldc)
 {
-    float alpha_float = alpha;
-    float beta_float  = beta;
+    if constexpr(!std::is_same_v<Tc, double>)
+    {
+        float alpha_float = alpha;
+        float beta_float  = beta;
 
-    host_vector<float> A_float, C_float;
+        host_vector<float> A_float, C_float;
 
-    cast_to_buffer(transA, n, k, lda, A, A_float);
-    cast_to_buffer(rocblas_operation_none, n, n, ldc, C, C_float);
+        cast_to_buffer(transA, n, k, lda, A, A_float);
+        cast_to_buffer(rocblas_operation_none, n, n, ldc, C, C_float);
 
-    ref_syrk(uplo,
-             transA,
-             n,
-             k,
-             alpha_float,
-             (const float*)A_float.data(),
-             lda,
-             beta_float,
-             C_float.data(),
-             ldc);
+        ref_syrk(uplo,
+                transA,
+                n,
+                k,
+                alpha_float,
+                (const float*)A_float.data(),
+                lda,
+                beta_float,
+                C_float.data(),
+                ldc);
+
+        cast_from_buffer(n, n, ldc, C_float, C);
+    }
+    else
+    {
+        double alpha_double = alpha;
+        double beta_double  = beta;
+
+        host_vector<double> A_double, C_double;
+
+        cast_to_buffer(transA, n, k, lda, A, A_double);
+        cast_to_buffer(rocblas_operation_none, n, n, ldc, C, C_double);
 
-    cast_from_buffer(n, n, ldc, C_float, C);
+        ref_syrk(uplo,
+                transA,
+                n,
+                k,
+                alpha_double,
+                (const double*)A_double.data(),
+                lda,
+                beta_double,
+                C_double.data(),
+                ldc);
+
+        cast_from_buffer(n, n, ldc, C_double, C);
+    }
 }
 
 #define INSTANTIATE_SYRK_EX_TEMPLATE(T_, U_, Tc_)                    \
diff --git a/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp b/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp
index 373da18e692..39e3a86415b 100644
--- a/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp
+++ b/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp
@@ -372,8 +372,8 @@ void testing_syrk_ex(const Arguments& arg)
                     // reference is computed on floats
                     double tol = rocblas_handle(handle)->getArchMajor() == 11
                                      ? sum_error_tolerance_for_gfx11<Tex, Ti, To>
-                                     : sum_error_tolerance<Ti>;
-                    tol *= K * 4;
+                                     : sum_error_tolerance<Tex>;
+                    tol = tol * K + sum_error_tolerance<To>; // add To conversion rounding error
                     near_check_general<To, To_hpa>(N, N, ldc, hC_gold, hC, tol);
                 }
                 else
diff --git a/projects/rocblas/clients/include/near.hpp b/projects/rocblas/clients/include/near.hpp
index 804fbb2e39f..307809f8fcd 100644
--- a/projects/rocblas/clients/include/near.hpp
+++ b/projects/rocblas/clients/include/near.hpp
@@ -86,6 +86,14 @@ template <>
 inline constexpr double
     sum_error_tolerance_for_gfx11<rocblas_half, rocblas_half, rocblas_half> = 1 / 100.0;
 
+template <> // syrk_ex use
+inline constexpr double sum_error_tolerance_for_gfx11<double, float, float>
+    = get_epsilon<float>();
+
+template <> // syrk_ex use
+inline constexpr double sum_error_tolerance_for_gfx11<double, float, double>
+    = get_epsilon<float>();
+
 template <>
 inline constexpr double sum_error_tolerance_for_gfx11<rocblas_float_complex,
                                                       rocblas_float_complex,

From 7ab567c2f2a05de1b24b90962adc7663f15e2382 Mon Sep 17 00:00:00 2001
From: Torre Zuk <Torre.Zuk@amd.com>
Date: Fri, 21 Nov 2025 16:30:01 -0700
Subject: [PATCH 2/2] revised tolerance for non-gfx11

---
 .../clients/common/cblas_interface.cpp        | 36 +++++++++----------
 .../include/blas_ex/testing_syrk_ex.hpp       |  4 +--
 projects/rocblas/clients/include/near.hpp     |  6 ++--
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/projects/rocblas/clients/common/cblas_interface.cpp b/projects/rocblas/clients/common/cblas_interface.cpp
index 5335cfd8753..ebe2612bbcb 100644
--- a/projects/rocblas/clients/common/cblas_interface.cpp
+++ b/projects/rocblas/clients/common/cblas_interface.cpp
@@ -2253,15 +2253,15 @@ void ref_syrk_ex(rocblas_fill      uplo,
         cast_to_buffer(rocblas_operation_none, n, n, ldc, C, C_float);
 
         ref_syrk(uplo,
-                transA,
-                n,
-                k,
-                alpha_float,
-                (const float*)A_float.data(),
-                lda,
-                beta_float,
-                C_float.data(),
-                ldc);
+                 transA,
+                 n,
+                 k,
+                 alpha_float,
+                 (const float*)A_float.data(),
+                 lda,
+                 beta_float,
+                 C_float.data(),
+                 ldc);
 
         cast_from_buffer(n, n, ldc, C_float, C);
     }
@@ -2276,15 +2276,15 @@ void ref_syrk_ex(rocblas_fill      uplo,
         cast_to_buffer(rocblas_operation_none, n, n, ldc, C, C_double);
 
         ref_syrk(uplo,
-                transA,
-                n,
-                k,
-                alpha_double,
-                (const double*)A_double.data(),
-                lda,
-                beta_double,
-                C_double.data(),
-                ldc);
+                 transA,
+                 n,
+                 k,
+                 alpha_double,
+                 (const double*)A_double.data(),
+                 lda,
+                 beta_double,
+                 C_double.data(),
+                 ldc);
 
         cast_from_buffer(n, n, ldc, C_double, C);
     }
diff --git a/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp b/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp
index 39e3a86415b..27fe010ea67 100644
--- a/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp
+++ b/projects/rocblas/clients/include/blas_ex/testing_syrk_ex.hpp
@@ -372,8 +372,8 @@ void testing_syrk_ex(const Arguments& arg)
                     // reference is computed on floats
                     double tol = rocblas_handle(handle)->getArchMajor() == 11
                                      ? sum_error_tolerance_for_gfx11<Tex, Ti, To>
-                                     : sum_error_tolerance<Tex>;
-                    tol = tol * K + sum_error_tolerance<To>; // add To conversion rounding error
+                                     : 4 * sum_error_tolerance<Ti>;
+                    tol = tol * K + 2 * sum_error_tolerance<To>; // add To conversion rounding error
                     near_check_general<To, To_hpa>(N, N, ldc, hC_gold, hC, tol);
                 }
                 else
diff --git a/projects/rocblas/clients/include/near.hpp b/projects/rocblas/clients/include/near.hpp
index 307809f8fcd..5f389b2349f 100644
--- a/projects/rocblas/clients/include/near.hpp
+++ b/projects/rocblas/clients/include/near.hpp
@@ -87,12 +87,10 @@ inline constexpr double
     sum_error_tolerance_for_gfx11<rocblas_half, rocblas_half, rocblas_half> = 1 / 100.0;
 
 template <> // syrk_ex use
-inline constexpr double sum_error_tolerance_for_gfx11<double, float, float>
-    = get_epsilon<float>();
+inline constexpr double sum_error_tolerance_for_gfx11<double, float, float> = get_epsilon<float>();
 
 template <> // syrk_ex use
-inline constexpr double sum_error_tolerance_for_gfx11<double, float, double>
-    = get_epsilon<float>();
+inline constexpr double sum_error_tolerance_for_gfx11<double, float, double> = get_epsilon<float>();
 
 template <>
 inline constexpr double sum_error_tolerance_for_gfx11<rocblas_float_complex,