ROCm · daineAMD · Aug 22, 2019 · Aug 12, 2019 · Aug 12, 2019 · Aug 12, 2019
@@ -50,14 +50,19 @@ using namespace std::literals;
 #include "testing_trsv.hpp"
 
 // Template to dispatch testing_gemm_ex for performance tests
-// When Ti == void or complex, the test is marked invalid
+// When Ti == void or Ti == To == Tc == bfloat16, the test is marked invalid
 template <typename Ti, typename To = Ti, typename Tc = To, typename = void>
 struct perf_gemm_ex : rocblas_test_invalid
 {
 };
 
 template <typename Ti, typename To, typename Tc>
-struct perf_gemm_ex<Ti, To, Tc, typename std::enable_if<!std::is_same<Ti, void>{}>::type>
+struct perf_gemm_ex<Ti,
+                    To,
+                    Tc,
+                    typename std::enable_if<!std::is_same<Ti, void>{}
+                                            && !(std::is_same<Ti, To>{} && std::is_same<Ti, Tc>{}
+                                                 && std::is_same<Ti, rocblas_bfloat16>{})>::type>
 {
     explicit operator bool()
     {
@@ -70,17 +75,20 @@ struct perf_gemm_ex<Ti, To, Tc, typename std::enable_if<!std::is_same<Ti, void>{
 };
 
 // Template to dispatch testing_gemm_strided_batched_ex for performance tests
-// When Ti == void or complex, the test is marked invalid
+// When Ti == void or Ti == To == Tc == bfloat16, the test is marked invalid
 template <typename Ti, typename To = Ti, typename Tc = To, typename = void>
 struct perf_gemm_strided_batched_ex : rocblas_test_invalid
 {
 };
 
 template <typename Ti, typename To, typename Tc>
-struct perf_gemm_strided_batched_ex<Ti,
-                                    To,
-                                    Tc,
-                                    typename std::enable_if<!std::is_same<Ti, void>{}>::type>
+struct perf_gemm_strided_batched_ex<
+    Ti,
+    To,
+    Tc,
+    typename std::enable_if<!std::is_same<Ti, void>{}
+                            && !(std::is_same<Ti, To>{} && std::is_same<Ti, Tc>{}
+                                 && std::is_same<Ti, rocblas_bfloat16>{})>::type>
 {
     explicit operator bool()
     {
@@ -163,6 +171,23 @@ struct perf_blas<
     }
 };
 
+template <typename T, typename U>
+struct perf_blas<T, U, typename std::enable_if<std::is_same<T, rocblas_bfloat16>{}>::type>
+{
+    explicit operator bool()
+    {
+        return true;
+    }
+    void operator()(const Arguments& arg)
+    {
+        if(!strcmp(arg.function, "dot"))
+            testing_dot<T>(arg);
+        else
+            throw std::invalid_argument("Invalid combination --function "s + arg.function
+                                        + " --a_type "s + rocblas_datatype2string(arg.a_type));
+    }
+};
+
 template <typename T, typename U>
 struct perf_blas<T, U, typename std::enable_if<std::is_same<T, rocblas_half>{}>::type>
 {
@@ -174,6 +199,8 @@ struct perf_blas<T, U, typename std::enable_if<std::is_same<T, rocblas_half>{}>:
     {
         if(!strcmp(arg.function, "axpy"))
             testing_axpy<T>(arg);
+        else if(!strcmp(arg.function, "dot"))
+            testing_dot<T>(arg);
         else if(!strcmp(arg.function, "gemm"))
             testing_gemm<T>(arg);
         else if(!strcmp(arg.function, "gemm_strided_batched"))

@@ -39,6 +39,50 @@ void cblas_axpy<rocblas_half>(rocblas_int   n,
     }
 }
 
+template <>
+void cblas_dot<rocblas_half>(rocblas_int         n,
+                             const rocblas_half* x,
+                             rocblas_int         incx,
+                             const rocblas_half* y,
+                             rocblas_int         incy,
+                             rocblas_half*       result)
+{
+    size_t             abs_incx = incx >= 0 ? incx : -incx;
+    size_t             abs_incy = incy >= 0 ? incy : -incy;
+    host_vector<float> x_float(n * abs_incx);
+    host_vector<float> y_float(n * abs_incy);
+
+    for(size_t i = 0; i < n; i++)
+    {
+        x_float[i * abs_incx] = half_to_float(x[i * abs_incx]);
+        y_float[i * abs_incy] = half_to_float(y[i * abs_incy]);
+    }
+
+    *result = float_to_half(cblas_sdot(n, x_float, incx, y_float, incy));
+}
+
+template <>
+void cblas_dot<rocblas_bfloat16>(rocblas_int             n,
+                                 const rocblas_bfloat16* x,
+                                 rocblas_int             incx,
+                                 const rocblas_bfloat16* y,
+                                 rocblas_int             incy,
+                                 rocblas_bfloat16*       result)
+{
+    size_t             abs_incx = incx >= 0 ? incx : -incx;
+    size_t             abs_incy = incy >= 0 ? incy : -incy;
+    host_vector<float> x_float(n * abs_incx);
+    host_vector<float> y_float(n * abs_incy);
+
+    for(size_t i = 0; i < n; i++)
+    {
+        x_float[i * abs_incx] = float(x[i * abs_incx]);
+        y_float[i * abs_incy] = float(y[i * abs_incy]);
+    }
+
+    *result = rocblas_bfloat16(cblas_sdot(n, x_float, incx, y_float, incy));
+}
+
 /*
  * ===========================================================================
  *    level 2 BLAS

@@ -83,7 +83,8 @@ namespace
                     || std::is_same<Ti, double>{}))
 
             || (BLAS1 == blas1::dot && std::is_same<Ti, To>{} && std::is_same<To, Tc>{}
-                && (std::is_same<Ti, rocblas_float_complex>{}
+                && (std::is_same<Ti, rocblas_half>{} || std::is_same<Ti, rocblas_bfloat16>{}
+                    || std::is_same<Ti, rocblas_float_complex>{}
                     || std::is_same<Ti, rocblas_double_complex>{} || std::is_same<Ti, float>{}
                     || std::is_same<Ti, double>{}))
 

@@ -34,7 +34,7 @@ Tests:
 #   - iamin: *single_double_precisions_complex_real # broken for now -- cause unknown
     - axpy:  *half_single_precisions_complex_real
     - copy:  *single_double_precisions_complex_real
-    - dot:   *single_double_precisions_complex_real
+    - dot:   *half_bfloat_single_double_complex_real_precisions
     - dotc:  *single_double_precisions_complex
     - scal:  *single_double_precisions_complex_real
     - scal:  *single_double_complex_real_in_complex_out
@@ -68,7 +68,7 @@ Tests:
     - iamin_bad_arg: *single_double_precisions_complex_real
     - axpy_bad_arg:  *half_single_precisions_complex_real
     - copy_bad_arg:  *single_double_precisions_complex_real
-    - dot_bad_arg:   *single_double_precisions_complex_real
+    - dot_bad_arg:   *half_bfloat_single_double_complex_real_precisions
     - dotc_bad_arg:  *single_double_precisions_complex
     - scal_bad_arg:  *single_double_precisions_complex_real
     - scal_bad_arg:  *single_double_complex_real_in_complex_out

@@ -113,7 +113,11 @@ namespace
     // When Ti = To = Tc != void, this test applies.
     // When converted to bool, this functor returns true.
     template <typename T>
-    struct gemm_testing<T, T, T, typename std::enable_if<!std::is_same<T, void>{}>::type>
+    struct gemm_testing<T,
+                        T,
+                        T,
+                        typename std::enable_if<!std::is_same<T, void>{}
+                                                && !std::is_same<T, rocblas_bfloat16>{}>::type>
     {
         explicit operator bool()
         {
@@ -162,7 +166,13 @@ namespace
     // When Ti != void, this test applies.
     // When converted to bool, this functor returns true.
     template <typename Ti, typename To, typename Tc>
-    struct gemm_ex_testing<Ti, To, Tc, typename std::enable_if<!std::is_same<Ti, void>{}>::type>
+    struct gemm_ex_testing<
+        Ti,
+        To,
+        Tc,
+        typename std::enable_if<!std::is_same<Ti, void>{}
+                                && !(std::is_same<Ti, To>{} && std::is_same<Ti, Tc>{}
+                                     && std::is_same<Ti, rocblas_bfloat16>{})>::type>
     {
         explicit operator bool()
         {

@@ -100,6 +100,12 @@ static constexpr auto rocblas_dot<float> = rocblas_sdot;
 template <>
 static constexpr auto rocblas_dot<double> = rocblas_ddot;
 
+template <>
+static constexpr auto rocblas_dot<rocblas_half> = rocblas_hdot;
+
+template <>
+static constexpr auto rocblas_dot<rocblas_bfloat16> = rocblas_bfdot;
+
 template <>
 static constexpr auto rocblas_dot<rocblas_float_complex> = rocblas_cdotu;
 

@@ -40,6 +40,8 @@ Real precisions: &real_precisions
     { a_type: f64_r, b_type: f64_r, c_type: f64_r, d_type: f64_r, compute_type: f64_r }
   - &int8_precision
     { a_type:  i8_r, b_type:  i8_r, c_type: i32_r, d_type: i32_r, compute_type: i32_r }
+  - &bf16_precision
+    { a_type:  bf16_r, b_type:  bf16_r, c_type: bf16_r, d_type: bf16_r, compute_type: bf16_r }
   - &hpa_bf16_precision
     { a_type:  bf16_r, b_type:  bf16_r, c_type: bf16_r, d_type: bf16_r, compute_type: f32_r }
 
@@ -173,6 +175,19 @@ Single double joined: &single_double_complex_real_in_complex_out
   - *single_precision_complex_real_in_complex_out
   - *double_precision_complex_real_in_complex_out
 
+#############################################
+#           Used for Dot (quick)            #
+#############################################
+Half bfloat single double complex real: &half_bfloat_single_double_complex_real_precisions
+  - *half_precision
+  - *bf16_precision
+  - *single_precision
+  - *double_precision
+  - *half_precision_complex
+  - *single_precision_complex
+  - *double_precision_complex
+
+
 # The Arguments struct passed directly to C++. See rocblas_arguments.hpp.
 # The order of the entries is significant, so it can't simply be a dictionary.
 # The types on the RHS are eval'd for Python-recognized types including ctypes

@@ -156,8 +156,8 @@ void testing_dot(const Arguments& arg)
             std::cout << "cpu=" << cpu_result << ", gpu_host_ptr=" << rocblas_result_1
                       << ", gpu_device_ptr=" << rocblas_result_2 << "\n";
 
-            rocblas_error_1 = std::abs((cpu_result - rocblas_result_1) / cpu_result);
-            rocblas_error_2 = std::abs((cpu_result - rocblas_result_2) / cpu_result);
+            rocblas_error_1 = double(std::abs((cpu_result - rocblas_result_1) / cpu_result));
+            rocblas_error_2 = double(std::abs((cpu_result - rocblas_result_2) / cpu_result));
         }
     }
 

@@ -22,12 +22,14 @@ auto rocblas_simple_dispatch(const Arguments& arg)
     {
     case rocblas_datatype_f16_r:
         return TEST<rocblas_half>{}(arg);
+    case rocblas_datatype_bf16_r:
+        return TEST<rocblas_bfloat16>{}(arg);
     case rocblas_datatype_f32_r:
         return TEST<float>{}(arg);
     case rocblas_datatype_f64_r:
         return TEST<double>{}(arg);
-        //  case rocblas_datatype_f16_c:
-        //      return TEST<rocblas_half_complex>{}(arg);
+    //  case rocblas_datatype_f16_c:
+    //      return TEST<rocblas_half_complex>{}(arg);
     case rocblas_datatype_f32_c:
         return TEST<rocblas_float_complex>{}(arg);
     case rocblas_datatype_f64_c:

@@ -209,6 +209,22 @@ ROCBLAS_EXPORT rocblas_status rocblas_ddot(rocblas_handle handle,
                                            rocblas_int    incy,
                                            double*        result);
 
+ROCBLAS_EXPORT rocblas_status rocblas_hdot(rocblas_handle      handle,
+                                           rocblas_int         n,
+                                           const rocblas_half* x,
+                                           rocblas_int         incx,
+                                           const rocblas_half* y,
+                                           rocblas_int         incy,
+                                           rocblas_half*       result);
+
+ROCBLAS_EXPORT rocblas_status rocblas_bfdot(rocblas_handle          handle,
+                                            rocblas_int             n,
+                                            const rocblas_bfloat16* x,
+                                            rocblas_int             incx,
+                                            const rocblas_bfloat16* y,
+                                            rocblas_int             incy,
+                                            rocblas_bfloat16*       result);
+
 ROCBLAS_EXPORT rocblas_status rocblas_cdotu(rocblas_handle               handle,
                                             rocblas_int                  n,
                                             const rocblas_float_complex* x,

@@ -254,6 +254,15 @@ inline rocblas_bfloat16 cos(rocblas_bfloat16 a)
     return rocblas_bfloat16(cosf(float(a)));
 }
 
+// Inject standard functions into namespace std
+namespace std
+{
+    __device__ __host__ inline rocblas_bfloat16 abs(const rocblas_bfloat16& z)
+    {
+        return rocblas_bfloat16(z.data & 0x7fff);
+    }
+}
+
 #endif // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))
 
 #endif // _ROCBLAS_BFLOAT16_H_
@@ -215,7 +215,7 @@ __global__ void rocblas_reduction_kernel_part2(rocblas_int nblocks, To* workspac
 
     // Store result on device or in workspace
     if(tx == 0)
-        *result = FINALIZE{}(tmp[0]);
+        *result = Tr(FINALIZE{}(tmp[0]));
 }
 
 // At least two kernels are needed to finish the reduction