Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 34 additions & 7 deletions clients/benchmarks/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,19 @@ using namespace std::literals;
#include "testing_trsv.hpp"

// Template to dispatch testing_gemm_ex for performance tests
// When Ti == void or complex, the test is marked invalid
// When Ti == void or Ti == To == Tc == bfloat16, the test is marked invalid
template <typename Ti, typename To = Ti, typename Tc = To, typename = void>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not see a test for Ti == complex. Can the comment be updated if there is no test for complex.

Copy link
Copy Markdown
Contributor Author

@daineAMD daineAMD Aug 20, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The gemm_ex and gemm_strided_batched_ex templates act on exclusion rather than inclusion like the other templates. The exclusion of complex types was removed when complex gemm was added. I added exclusion for bfloat16 types here as they are now permissible in type_dispatch.hpp, but not for gemm_ex. Changed the comment to reflect this in #f746ec6.

struct perf_gemm_ex : rocblas_test_invalid
{
};

template <typename Ti, typename To, typename Tc>
struct perf_gemm_ex<Ti, To, Tc, typename std::enable_if<!std::is_same<Ti, void>{}>::type>
struct perf_gemm_ex<Ti,
To,
Tc,
typename std::enable_if<!std::is_same<Ti, void>{}
&& !(std::is_same<Ti, To>{} && std::is_same<Ti, Tc>{}
&& std::is_same<Ti, rocblas_bfloat16>{})>::type>
{
explicit operator bool()
{
Expand All @@ -70,17 +75,20 @@ struct perf_gemm_ex<Ti, To, Tc, typename std::enable_if<!std::is_same<Ti, void>{
};

// Template to dispatch testing_gemm_strided_batched_ex for performance tests
// When Ti == void or complex, the test is marked invalid
// When Ti == void or Ti == To == Tc == bfloat16, the test is marked invalid
template <typename Ti, typename To = Ti, typename Tc = To, typename = void>
struct perf_gemm_strided_batched_ex : rocblas_test_invalid
{
};

template <typename Ti, typename To, typename Tc>
struct perf_gemm_strided_batched_ex<Ti,
To,
Tc,
typename std::enable_if<!std::is_same<Ti, void>{}>::type>
struct perf_gemm_strided_batched_ex<
Ti,
To,
Tc,
typename std::enable_if<!std::is_same<Ti, void>{}
&& !(std::is_same<Ti, To>{} && std::is_same<Ti, Tc>{}
&& std::is_same<Ti, rocblas_bfloat16>{})>::type>
{
explicit operator bool()
{
Expand Down Expand Up @@ -163,6 +171,23 @@ struct perf_blas<
}
};

template <typename T, typename U>
struct perf_blas<T, U, typename std::enable_if<std::is_same<T, rocblas_bfloat16>{}>::type>
{
explicit operator bool()
{
return true;
}
void operator()(const Arguments& arg)
{
if(!strcmp(arg.function, "dot"))
testing_dot<T>(arg);
else
throw std::invalid_argument("Invalid combination --function "s + arg.function
+ " --a_type "s + rocblas_datatype2string(arg.a_type));
}
};

template <typename T, typename U>
struct perf_blas<T, U, typename std::enable_if<std::is_same<T, rocblas_half>{}>::type>
{
Expand All @@ -174,6 +199,8 @@ struct perf_blas<T, U, typename std::enable_if<std::is_same<T, rocblas_half>{}>:
{
if(!strcmp(arg.function, "axpy"))
testing_axpy<T>(arg);
else if(!strcmp(arg.function, "dot"))
testing_dot<T>(arg);
else if(!strcmp(arg.function, "gemm"))
testing_gemm<T>(arg);
else if(!strcmp(arg.function, "gemm_strided_batched"))
Expand Down
44 changes: 44 additions & 0 deletions clients/common/cblas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,50 @@ void cblas_axpy<rocblas_half>(rocblas_int n,
}
}

template <>
void cblas_dot<rocblas_half>(rocblas_int n,
const rocblas_half* x,
rocblas_int incx,
const rocblas_half* y,
rocblas_int incy,
rocblas_half* result)
{
size_t abs_incx = incx >= 0 ? incx : -incx;
size_t abs_incy = incy >= 0 ? incy : -incy;
host_vector<float> x_float(n * abs_incx);
host_vector<float> y_float(n * abs_incy);

for(size_t i = 0; i < n; i++)
{
x_float[i * abs_incx] = half_to_float(x[i * abs_incx]);
y_float[i * abs_incy] = half_to_float(y[i * abs_incy]);
}

*result = float_to_half(cblas_sdot(n, x_float, incx, y_float, incy));
}

template <>
void cblas_dot<rocblas_bfloat16>(rocblas_int n,
const rocblas_bfloat16* x,
rocblas_int incx,
const rocblas_bfloat16* y,
rocblas_int incy,
rocblas_bfloat16* result)
{
size_t abs_incx = incx >= 0 ? incx : -incx;
size_t abs_incy = incy >= 0 ? incy : -incy;
host_vector<float> x_float(n * abs_incx);
host_vector<float> y_float(n * abs_incy);

for(size_t i = 0; i < n; i++)
{
x_float[i * abs_incx] = float(x[i * abs_incx]);
y_float[i * abs_incy] = float(y[i * abs_incy]);
}

*result = rocblas_bfloat16(cblas_sdot(n, x_float, incx, y_float, incy));
}

/*
* ===========================================================================
* level 2 BLAS
Expand Down
3 changes: 2 additions & 1 deletion clients/gtest/blas1_gtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ namespace
|| std::is_same<Ti, double>{}))

|| (BLAS1 == blas1::dot && std::is_same<Ti, To>{} && std::is_same<To, Tc>{}
&& (std::is_same<Ti, rocblas_float_complex>{}
&& (std::is_same<Ti, rocblas_half>{} || std::is_same<Ti, rocblas_bfloat16>{}
|| std::is_same<Ti, rocblas_float_complex>{}
|| std::is_same<Ti, rocblas_double_complex>{} || std::is_same<Ti, float>{}
|| std::is_same<Ti, double>{}))

Expand Down
4 changes: 2 additions & 2 deletions clients/gtest/blas1_gtest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Tests:
# - iamin: *single_double_precisions_complex_real # broken for now -- cause unknown
- axpy: *half_single_precisions_complex_real
- copy: *single_double_precisions_complex_real
- dot: *single_double_precisions_complex_real
- dot: *half_bfloat_single_double_complex_real_precisions
- dotc: *single_double_precisions_complex
- scal: *single_double_precisions_complex_real
- scal: *single_double_complex_real_in_complex_out
Expand Down Expand Up @@ -68,7 +68,7 @@ Tests:
- iamin_bad_arg: *single_double_precisions_complex_real
- axpy_bad_arg: *half_single_precisions_complex_real
- copy_bad_arg: *single_double_precisions_complex_real
- dot_bad_arg: *single_double_precisions_complex_real
- dot_bad_arg: *half_bfloat_single_double_complex_real_precisions
- dotc_bad_arg: *single_double_precisions_complex
- scal_bad_arg: *single_double_precisions_complex_real
- scal_bad_arg: *single_double_complex_real_in_complex_out
Expand Down
14 changes: 12 additions & 2 deletions clients/gtest/gemm_gtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,11 @@ namespace
// When Ti = To = Tc != void, this test applies.
// When converted to bool, this functor returns true.
template <typename T>
struct gemm_testing<T, T, T, typename std::enable_if<!std::is_same<T, void>{}>::type>
struct gemm_testing<T,
T,
T,
typename std::enable_if<!std::is_same<T, void>{}
&& !std::is_same<T, rocblas_bfloat16>{}>::type>
{
explicit operator bool()
{
Expand Down Expand Up @@ -162,7 +166,13 @@ namespace
// When Ti != void, this test applies.
// When converted to bool, this functor returns true.
template <typename Ti, typename To, typename Tc>
struct gemm_ex_testing<Ti, To, Tc, typename std::enable_if<!std::is_same<Ti, void>{}>::type>
struct gemm_ex_testing<
Ti,
To,
Tc,
typename std::enable_if<!std::is_same<Ti, void>{}
&& !(std::is_same<Ti, To>{} && std::is_same<Ti, Tc>{}
&& std::is_same<Ti, rocblas_bfloat16>{})>::type>
{
explicit operator bool()
{
Expand Down
6 changes: 6 additions & 0 deletions clients/include/rocblas.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ static constexpr auto rocblas_dot<float> = rocblas_sdot;
template <>
static constexpr auto rocblas_dot<double> = rocblas_ddot;

template <>
static constexpr auto rocblas_dot<rocblas_half> = rocblas_hdot;

template <>
static constexpr auto rocblas_dot<rocblas_bfloat16> = rocblas_bfdot;

template <>
static constexpr auto rocblas_dot<rocblas_float_complex> = rocblas_cdotu;

Expand Down
15 changes: 15 additions & 0 deletions clients/include/rocblas_common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ Real precisions: &real_precisions
{ a_type: f64_r, b_type: f64_r, c_type: f64_r, d_type: f64_r, compute_type: f64_r }
- &int8_precision
{ a_type: i8_r, b_type: i8_r, c_type: i32_r, d_type: i32_r, compute_type: i32_r }
- &bf16_precision
{ a_type: bf16_r, b_type: bf16_r, c_type: bf16_r, d_type: bf16_r, compute_type: bf16_r }
- &hpa_bf16_precision
{ a_type: bf16_r, b_type: bf16_r, c_type: bf16_r, d_type: bf16_r, compute_type: f32_r }

Expand Down Expand Up @@ -173,6 +175,19 @@ Single double joined: &single_double_complex_real_in_complex_out
- *single_precision_complex_real_in_complex_out
- *double_precision_complex_real_in_complex_out

#############################################
# Used for Dot (quick) #
#############################################
Half bfloat single double complex real: &half_bfloat_single_double_complex_real_precisions
- *half_precision
- *bf16_precision
- *single_precision
- *double_precision
- *half_precision_complex
- *single_precision_complex
- *double_precision_complex


# The Arguments struct passed directly to C++. See rocblas_arguments.hpp.
# The order of the entries is significant, so it can't simply be a dictionary.
# The types on the RHS are eval'd for Python-recognized types including ctypes
Expand Down
4 changes: 2 additions & 2 deletions clients/include/testing_dot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ void testing_dot(const Arguments& arg)
std::cout << "cpu=" << cpu_result << ", gpu_host_ptr=" << rocblas_result_1
<< ", gpu_device_ptr=" << rocblas_result_2 << "\n";

rocblas_error_1 = std::abs((cpu_result - rocblas_result_1) / cpu_result);
rocblas_error_2 = std::abs((cpu_result - rocblas_result_2) / cpu_result);
rocblas_error_1 = double(std::abs((cpu_result - rocblas_result_1) / cpu_result));
rocblas_error_2 = double(std::abs((cpu_result - rocblas_result_2) / cpu_result));
}
}

Expand Down
6 changes: 4 additions & 2 deletions clients/include/type_dispatch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ auto rocblas_simple_dispatch(const Arguments& arg)
{
case rocblas_datatype_f16_r:
return TEST<rocblas_half>{}(arg);
case rocblas_datatype_bf16_r:
return TEST<rocblas_bfloat16>{}(arg);
case rocblas_datatype_f32_r:
return TEST<float>{}(arg);
case rocblas_datatype_f64_r:
return TEST<double>{}(arg);
// case rocblas_datatype_f16_c:
// return TEST<rocblas_half_complex>{}(arg);
// case rocblas_datatype_f16_c:
// return TEST<rocblas_half_complex>{}(arg);
case rocblas_datatype_f32_c:
return TEST<rocblas_float_complex>{}(arg);
case rocblas_datatype_f64_c:
Expand Down
16 changes: 16 additions & 0 deletions library/include/rocblas-functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,22 @@ ROCBLAS_EXPORT rocblas_status rocblas_ddot(rocblas_handle handle,
rocblas_int incy,
double* result);

ROCBLAS_EXPORT rocblas_status rocblas_hdot(rocblas_handle handle,
rocblas_int n,
const rocblas_half* x,
rocblas_int incx,
const rocblas_half* y,
rocblas_int incy,
rocblas_half* result);

ROCBLAS_EXPORT rocblas_status rocblas_bfdot(rocblas_handle handle,
rocblas_int n,
const rocblas_bfloat16* x,
rocblas_int incx,
const rocblas_bfloat16* y,
rocblas_int incy,
rocblas_bfloat16* result);

ROCBLAS_EXPORT rocblas_status rocblas_cdotu(rocblas_handle handle,
rocblas_int n,
const rocblas_float_complex* x,
Expand Down
9 changes: 9 additions & 0 deletions library/include/rocblas_bfloat16.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,15 @@ inline rocblas_bfloat16 cos(rocblas_bfloat16 a)
return rocblas_bfloat16(cosf(float(a)));
}

// Inject standard functions into namespace std
namespace std
{
__device__ __host__ inline rocblas_bfloat16 abs(const rocblas_bfloat16& z)
{
return rocblas_bfloat16(z.data & 0x7fff);
}
}

#endif // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__))

#endif // _ROCBLAS_BFLOAT16_H_
2 changes: 1 addition & 1 deletion library/src/blas1/reduction.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ __global__ void rocblas_reduction_kernel_part2(rocblas_int nblocks, To* workspac

// Store result on device or in workspace
if(tx == 0)
*result = FINALIZE{}(tmp[0]);
*result = Tr(FINALIZE{}(tmp[0]));
}

// At least two kernels are needed to finish the reduction
Expand Down
Loading