From 598a9e9cfd76fede4f1899e7ae927e8cee81ebab Mon Sep 17 00:00:00 2001 From: jzuniga-amd Date: Fri, 9 Aug 2019 12:22:27 -0600 Subject: [PATCH 1/5] Enable unit tests for gemv_batched and gemv_strided_batched - Add function templates for gemv_strided_batched and gemv_batched (in rocblas_gemv.hpp) to enable correct calls of these functions from other functions or from outside rocblas. - Add batch and strides checks and quick return in rocblas_gemv_batched.cpp and rocblas_gemv_strided_batched.cpp - Add unit tests testing_gemv_batched.hpp and testing_gemv_strided_batched.hpp - Add new class device_batch_vector in rocblas_vector.hpp. Needed for the batched case. - Add new template headers to rocblas.hpp - Add new template header and specializations for norm_check_general to work with the batched case (in norm.hpp and norm.cpp) - Add new template and specializations for unit_check_general to work with the batched case (in unit.hpp) - Add new arguments, stride_x and stride_y (needed to test gemv_strided_batched) in rocblas_arguments.hpp and rocblas_common.yaml. 
Set stride_x and stride_y defaults to zero in rocblas_common.yaml to correctly generate the tests of those functions that do not need these arguments - Include the new tests in client.cpp as well as a description of the new arguments - Add the new functions in rocblas_template.yaml to process YAML from log files - Add batched and strided_batched template test cases in gemv_gtest.cpp - Add new yaml test-data files gemv_batched_gtest.yaml and gemv_strided_batched_gtest.yaml - Include the new yaml files in rocblas_gtest.yaml - Add the new yaml files to the list of dependencies for rocblas_gtest.data in CMakeLists.txt --- clients/benchmarks/client.cpp | 16 + clients/common/norm.cpp | 274 ++++++++++- clients/gtest/CMakeLists.txt | 2 +- clients/gtest/gemv_batched_gtest.yaml | 91 ++++ clients/gtest/gemv_gtest.cpp | 111 ++++- clients/gtest/gemv_strided_batched_gtest.yaml | 91 ++++ clients/gtest/rocblas_gtest.yaml | 2 + clients/include/norm.hpp | 13 +- clients/include/rocblas.hpp | 59 +++ clients/include/rocblas_arguments.hpp | 7 + clients/include/rocblas_common.yaml | 4 + clients/include/rocblas_template.yaml | 4 + clients/include/rocblas_vector.hpp | 158 ++++--- clients/include/testing_gemv.hpp | 4 +- clients/include/testing_gemv_batched.hpp | 360 ++++++++++++++ .../include/testing_gemv_strided_batched.hpp | 442 ++++++++++++++++++ clients/include/unit.hpp | 89 ++++ library/src/blas2/gemv_device.hpp | 1 - library/src/blas2/rocblas_gemv.cpp | 1 - library/src/blas2/rocblas_gemv.hpp | 365 +++++++++++++++ library/src/blas2/rocblas_gemv_batched.cpp | 173 +------ .../blas2/rocblas_gemv_strided_batched.cpp | 218 ++------- 22 files changed, 2055 insertions(+), 430 deletions(-) create mode 100644 clients/gtest/gemv_batched_gtest.yaml create mode 100644 clients/gtest/gemv_strided_batched_gtest.yaml create mode 100644 clients/include/testing_gemv_batched.hpp create mode 100644 clients/include/testing_gemv_strided_batched.hpp diff --git a/clients/benchmarks/client.cpp 
b/clients/benchmarks/client.cpp index 4ff1679e5..6181075fa 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -13,6 +13,8 @@ #include "testing_dot.hpp" #include "testing_geam.hpp" #include "testing_gemv.hpp" +#include "testing_gemv_batched.hpp" +#include "testing_gemv_strided_batched.hpp" #include "testing_ger.hpp" #include "testing_iamax_iamin.hpp" #include "testing_nrm2.hpp" @@ -141,6 +143,10 @@ struct perf_blas< testing_nrm2(arg); else if(!strcmp(arg.function, "gemv")) testing_gemv(arg); + else if(!strcmp(arg.function, "gemv_batched")) + testing_gemv_batched(arg); + else if(!strcmp(arg.function, "gemv_strided_batched")) + testing_gemv_strided_batched(arg); else if(!strcmp(arg.function, "ger")) testing_ger(arg); else if(!strcmp(arg.function, "syr")) @@ -500,6 +506,16 @@ try "Specific stride of strided_batched matrix D, is only applicable to strided batched" "BLAS_EX: second dimension * leading dimension.") + ("stride_x", + value(&arg.stride_x)->default_value(128*128), + "Specific stride of strided_batched vector x, is only applicable to strided batched" + "BLAS_2: second dimension.") + + ("stride_y", + value(&arg.stride_y)->default_value(128*128), + "Specific stride of strided_batched vector y, is only applicable to strided batched" + "BLAS_2: leading dimension.") + ("incx", value(&arg.incx)->default_value(1), "increment between values in x vector") diff --git a/clients/common/norm.cpp b/clients/common/norm.cpp index b350e6c2d..e7760d42a 100644 --- a/clients/common/norm.cpp +++ b/clients/common/norm.cpp @@ -355,7 +355,6 @@ double norm_check_general(char norm_type, return cumulative_error; } -//=====Norm Check for strided_batched matrix template <> double norm_check_general(char norm_type, rocblas_int M, @@ -506,6 +505,279 @@ double norm_check_general(char norm_type, return cumulative_error; } +template <> +double norm_check_general(char norm_type, + rocblas_int M, + rocblas_int N, + rocblas_int lda, + rocblas_int stride_a, + 
rocblas_int batch_count, + rocblas_float_complex* hCPU, + rocblas_float_complex* hGPU) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + float work; + rocblas_int incx = 1; + float alpha = -1.0f; + rocblas_int size = lda * N; + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + float cpu_norm = clange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work); + + caxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx); + + float error = clange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm; + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? 
cumulative_error : error; + } + } + + return cumulative_error; +} + +template <> +double norm_check_general(char norm_type, + rocblas_int M, + rocblas_int N, + rocblas_int lda, + rocblas_int stride_a, + rocblas_int batch_count, + rocblas_double_complex* hCPU, + rocblas_double_complex* hGPU) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + double work; + rocblas_int incx = 1; + double alpha = -1.0f; + rocblas_int size = lda * N; + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + double cpu_norm = zlange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work); + + zaxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx); + + double error = zlange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm; + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? 
cumulative_error : error; + } + } + + return cumulative_error; +} + +//=====Norm Check for batched matrix +template <> +double norm_check_general(char norm_type, + rocblas_int M, + rocblas_int N, + rocblas_int lda, + rocblas_int batch_count, + host_vector hCPU[], + host_vector hGPU[]) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + float work; + rocblas_int incx = 1; + float alpha = -1.0f; + rocblas_int size = lda * N; + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + float cpu_norm = slange_(&norm_type, &M, &N, hCPU[i], &lda, &work); + + saxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx); + + float error = slange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm; + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? 
cumulative_error : error; + } + } + + return cumulative_error; +} + +template <> +double norm_check_general(char norm_type, + rocblas_int M, + rocblas_int N, + rocblas_int lda, + rocblas_int batch_count, + host_vector hCPU[], + host_vector hGPU[]) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + double work; + rocblas_int incx = 1; + double alpha = -1.0f; + rocblas_int size = lda * N; + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + double cpu_norm = dlange_(&norm_type, &M, &N, hCPU[i], &lda, &work); + + daxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx); + + double error = dlange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm; + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? 
cumulative_error : error; + } + } + + return cumulative_error; +} + +template <> +double norm_check_general(char norm_type, + rocblas_int M, + rocblas_int N, + rocblas_int lda, + rocblas_int batch_count, + host_vector hCPU[], + host_vector hGPU[]) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + float work; + rocblas_int incx = 1; + float alpha = -1.0f; + rocblas_int size = lda * N; + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + float cpu_norm = clange_(&norm_type, &M, &N, hCPU[i], &lda, &work); + + caxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx); + + float error = clange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm; + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? 
cumulative_error : error; + } + } + + return cumulative_error; +} + +template <> +double norm_check_general(char norm_type, + rocblas_int M, + rocblas_int N, + rocblas_int lda, + rocblas_int batch_count, + host_vector hCPU[], + host_vector hGPU[]) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + double work; + rocblas_int incx = 1; + double alpha = -1.0f; + rocblas_int size = lda * N; + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + double cpu_norm = zlange_(&norm_type, &M, &N, hCPU[i], &lda, &work); + + zaxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx); + + double error = zlange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm; + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? 
cumulative_error : error; + } + } + + return cumulative_error; +} + /* ============================Norm Check for Symmetric Matrix: float/double/complex template * speciliazation ======================================= */ diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index a6c602bda..6f073afe1 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -113,7 +113,7 @@ endif( ) set( ROCBLAS_TEST_DATA "${PROJECT_BINARY_DIR}/staging/rocblas_gtest.data") add_custom_command( OUTPUT "${ROCBLAS_TEST_DATA}" COMMAND ../common/rocblas_gentest.py -I ../include rocblas_gtest.yaml -o "${ROCBLAS_TEST_DATA}" - DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml + DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml gemv_batched_gtest.yaml gemv_strided_batched_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" ) add_custom_target( rocblas-test-data DEPENDS "${ROCBLAS_TEST_DATA}" ) diff --git a/clients/gtest/gemv_batched_gtest.yaml b/clients/gtest/gemv_batched_gtest.yaml new file mode 100644 index 000000000..e0eeedd3d --- /dev/null +++ b/clients/gtest/gemv_batched_gtest.yaml @@ -0,0 +1,91 @@ +--- +include: rocblas_common.yaml +include: known_bugs.yaml + +Definitions: + - &small_matrix_size_range + - { M: -1, N: 1, lda: 1 } + - { M: 1, N: 
-1, lda: 1 } + - { M: 1, N: 1, lda: 0 } + - { M: 10, N: 10, lda: 9 } + - { M: 0, N: 1, lda: 1 } + - { M: 1, N: 0, lda: 1 } + - { M: -1, N: -1, lda: -1 } + - { M: 10, N: 10, lda: 2 } + - { M: 100, N: 200, lda: 200 } + + - &medium_matrix_size_range + - { M: 300, N: 400, lda: 400 } + - { M: 600, N: 500, lda: 601 } + + - &large_matrix_size_range + - { M: 1000, N: 1000, lda: 1000 } + - { M: 2000, N: 2000, lda: 2000 } + - { M: 4011, N: 4011, lda: 4011 } + - { M: 8000, N: 8000, lda: 8000 } + + - &incx_incy_range + - { incx: 2, incy: 1 } + - { incx: -1, incy: 2 } + - { incx: 1, incy: 1 } + - { incx: -1, incy: 3 } + - { incx: 3, incy: -1 } + - { incx: 0, incy: 1 } + - { incx: 1, incy: 0 } + - { incx: 0, incy: -1 } + - { incx: 10, incy: 100 } + + - &alpha_beta_range + - { alpha: 2.0, beta: 0.0 } + - { alpha: -1.0, beta: -1.0 } + - { alpha: 2.0, beta: 1.0 } + - { alpha: 0.0, beta: 1.0 } + +Tests: +- name: gemv_batched_bad_arg + category: pre_checkin + function: gemv_batched_bad_arg + precision: *single_double_precisions + transA: N + +- name: gemv_batched_NaN + category: pre_checkin + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha: [ -1.0, 0, 1.0, 2.0 ] + beta: .NaN # converted to 0.0 in test code + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_batched_small + category: quick + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *small_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_batched_medium + category: pre_checkin + function: gemv_batched + precision: *single_double_precisions_complex_real + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] + +- name: gemv_batched_large + category: nightly + function: gemv_batched + precision: 
*single_double_precisions + transA: [ N, T, C ] + matrix_size: *large_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] +... diff --git a/clients/gtest/gemv_gtest.cpp b/clients/gtest/gemv_gtest.cpp index ff86b426d..fd9823333 100644 --- a/clients/gtest/gemv_gtest.cpp +++ b/clients/gtest/gemv_gtest.cpp @@ -6,6 +6,8 @@ #include "rocblas_datatype2string.hpp" #include "rocblas_test.hpp" #include "testing_gemv.hpp" +#include "testing_gemv_batched.hpp" +#include "testing_gemv_strided_batched.hpp" #include "type_dispatch.hpp" #include #include @@ -13,6 +15,69 @@ namespace { + // possible gemv test cases + enum gemv_test_type + { + GEMV, + GEMV_BATCHED, + GEMV_STRIDED_BATCHED, + }; + + //gemv test template + template