diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..2ce99d5bb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "extern/blis"] + path = extern/blis + url = https://github.com/amd/blis.git + branch = 1.3 diff --git a/.jenkins/Dependency b/.jenkins/Dependency new file mode 100644 index 000000000..c668975fb --- /dev/null +++ b/.jenkins/Dependency @@ -0,0 +1,140 @@ +#!/usr/bin/env groovy +// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ +@Library('rocJenkins') _ + +// This is file for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. + +import com.amd.project.* +import com.amd.docker.* + + +//////////////////////////////////////////////////////////////////////// +// Mostly generated from snippet generator 'properties; set job properties' +// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM +properties([ + pipelineTriggers([[$class: 'PeriodicFolderTrigger', interval: '1d']]), + buildDiscarder(logRotator( + artifactDaysToKeepStr: '', + artifactNumToKeepStr: '', + daysToKeepStr: '', + numToKeepStr: '10')), + disableConcurrentBuilds(), + [$class: 'CopyArtifactPermissionProperty', projectNames: '*'] + ]) + +import java.nio.file.Path; + +rocBLASCI: +{ + + def rocblas = new rocProject('rocBLAS') + // customize for project + rocblas.paths.build_command = './install.sh -lasm_ci -c -b develop' + + // Define test architectures, optional rocm version argument is available + def nodes = new dockerNodes(['gfx900 && ubuntu', 'gfx906 && centos7'], rocblas) + + boolean formatCheck = true + + def compileCommand = + { + platform, project-> + + project.paths.construct_build_prefix() + + def command + + if(platform.jenkinsLabel.contains('hip-clang')) + { + command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=/opt/rocm/bin/hipcc 
${project.paths.build_command} --hip-clang + """ + } + else + { + command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=/opt/rocm/bin/hcc ${project.paths.build_command} + """ + } + platform.runCommand(this, command) + } + + def testCommand = + { + platform, project-> + + def command + + if(platform.jenkinsLabel.contains('centos')) + { + command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix}/build/release/clients/staging + LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG sudo ./rocblas-test --gtest_output=xml --gtest_color=yes --gtest_filter=*nightly*-*known_bug* #--gtest_filter=*nightly* + """ + + platform.runCommand(this, command) + junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" + } + else + { + command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix}/build/release/clients/staging + LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./rocblas-test --gtest_output=xml --gtest_color=yes --gtest_filter=*nightly*-*known_bug* #--gtest_filter=*nightly* + """ + + platform.runCommand(this, command) + junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" + } + } + + def packageCommand = + { + platform, project-> + + def command + + if(platform.jenkinsLabel.contains('centos')) + { + command = """ + set -x + cd ${project.paths.project_build_prefix}/build/release + make package + mkdir -p package + mv *.rpm package/ + rpm -qlp package/*.rpm + """ + + platform.runCommand(this, command) + platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/release/package/*.rpm""") + } + else if(platform.jenkinsLabel.contains('hip-clang')) + { + packageCommand = null + } + else + { + command = """ + set -x + cd ${project.paths.project_build_prefix}/build/release + make package + make package_clients + mkdir -p package + mv *.deb package/ + mv 
clients/*.deb package/ + """ + + platform.runCommand(this, command) + platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/release/package/*.deb""") + } + } + + buildProject(rocblas, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) + +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 0710dd914..7c872ca3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,8 @@ cmake_minimum_required( VERSION 3.5 ) # We use C++14 features, this will add compile option: -std=c++14 set( CMAKE_CXX_STANDARD 14 ) +# Without this line, it will add -std=gnu++14 instead, which has some issues. +set( CMAKE_CXX_EXTENSIONS OFF ) # Consider removing this in the future # This should appear before the project command, because it does not use FORCE @@ -33,6 +35,9 @@ endif() project( rocblas LANGUAGES CXX ) +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + # ######################################################################## # NOTE: CUDA compiling path # ######################################################################## @@ -135,7 +140,7 @@ include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) # Versioning via rocm-cmake -set ( VERSION_STRING "2.6.4" ) +set ( VERSION_STRING "2.8.0" ) rocm_setup_version( VERSION ${VERSION_STRING} ) # Append our library helper cmake path and the cmake path for hip (for convenience) @@ -171,21 +176,18 @@ if( BUILD_WITH_TENSILE ) set_property( CACHE Tensile_COMPILER PROPERTY STRINGS hcc hipcc) include(virtualenv) - if (Tensile_TEST_LOCAL_PATH) virtualenv_install(${Tensile_TEST_LOCAL_PATH}) message (STATUS "using local Tensile from ${Tensile_TEST_LOCAL_PATH}, copied to ${Tensile_ROOT}") else() # Use the virtual-env setup and download package from specified repot: set( tensile_fork "ROCmSoftwarePlatform" CACHE STRING "Tensile fork to use" ) - set( tensile_tag 1c58828cafd5f1285bf17b75ad5c04ca6c8fa88c CACHE STRING "Tensile tag to download" ) + set( tensile_tag 
015477ad8c8ea0ef6f59b8d49a65015b46b8a48e CACHE STRING "Tensile tag to download" ) virtualenv_install("git+https://github.com/${tensile_fork}/Tensile.git@${tensile_tag}") message (STATUS "using GIT Tensile fork=${tensile_fork} from branch=${tensile_tag}") endif() list(APPEND CMAKE_PREFIX_PATH ${VIRTUALENV_HOME_DIR}) - set( Tensile_ROOT "${VIRTUALENV_HOME_DIR}/bin" CACHE STRING "Local path of Tensile" ) - set( Tensile_TensileConfig ${VIRTUALENV_HOME_DIR}/cmake/TensileConfig.cmake) - + find_package(Tensile 4.11.0 EXACT REQUIRED HIP LLVM OpenMP PATHS "${INSTALLED_TENSILE_PATH}") endif() # Find HCC/HIP dependencies diff --git a/Jenkinsfile b/Jenkinsfile index 5bef76620..f988ab19b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -32,7 +32,7 @@ rocBLASCI: rocblas.paths.build_command = './install.sh -lasm_ci -c' // Define test architectures, optional rocm version argument is available - def nodes = new dockerNodes(['gfx900 && ubuntu', 'gfx906 && centos7'], rocblas) + def nodes = new dockerNodes(['gfx900 && ubuntu', 'gfx906 && ubuntu', 'gfx900 && centos7', 'gfx906 && centos7'], rocblas) boolean formatCheck = true @@ -41,7 +41,7 @@ rocBLASCI: platform, project-> project.paths.construct_build_prefix() - + def command if(platform.jenkinsLabel.contains('hip-clang')) @@ -63,6 +63,8 @@ rocBLASCI: platform.runCommand(this, command) } + rocblas.timeout.test = 10 + def testCommand = { platform, project-> @@ -78,7 +80,7 @@ rocBLASCI: cd ${project.paths.project_build_prefix}/build/release/clients/staging LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG sudo ./rocblas-test --gtest_output=xml --gtest_color=yes --gtest_filter=*nightly*-*known_bug* #--gtest_filter=*nightly* """ - + platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" } @@ -90,7 +92,7 @@ rocBLASCI: LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./example-sscal LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG sudo ./rocblas-test 
--gtest_output=xml --gtest_color=yes --gtest_filter=*quick*:*pre_checkin*-*known_bug* #--gtest_filter=*checkin* """ - + platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" } @@ -104,7 +106,7 @@ rocBLASCI: cd ${project.paths.project_build_prefix}/build/release/clients/staging LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./rocblas-test --gtest_output=xml --gtest_color=yes --gtest_filter=*nightly*-*known_bug* #--gtest_filter=*nightly* """ - + platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" } @@ -116,7 +118,7 @@ rocBLASCI: LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./example-sscal LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./rocblas-test --gtest_output=xml --gtest_color=yes --gtest_filter=*quick*:*pre_checkin*-*known_bug* #--gtest_filter=*checkin* """ - + platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" } @@ -127,21 +129,21 @@ rocBLASCI: { platform, project-> - def command - + def command + if(platform.jenkinsLabel.contains('centos')) { command = """ set -x cd ${project.paths.project_build_prefix}/build/release make package - rm -rf package && mkdir -p package + mkdir -p package mv *.rpm package/ rpm -qlp package/*.rpm """ platform.runCommand(this, command) - platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/release/package/*.rpm""") + platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/release/package/*.rpm""") } else if(platform.jenkinsLabel.contains('hip-clang')) { @@ -153,9 +155,10 @@ rocBLASCI: set -x cd ${project.paths.project_build_prefix}/build/release make package - rm -rf package && mkdir -p package + make package_clients + mkdir -p package mv *.deb package/ - dpkg -c package/*.deb + mv clients/*.deb package/ """ platform.runCommand(this, command) diff --git 
a/bump_develop_version.sh b/bump_develop_version.sh index b1d7ab28b..557af9c80 100755 --- a/bump_develop_version.sh +++ b/bump_develop_version.sh @@ -5,11 +5,11 @@ # - run this script in master branch # - after running this script merge master into develop -OLD_ROCBLAS_VERSION="2.6.0" -NEW_ROCBLAS_VERSION="2.7.0" +OLD_ROCBLAS_VERSION="2.8.0" +NEW_ROCBLAS_VERSION="2.9.0" -OLD_TENSILE_VERSION="tensile_tag f5b33e22367807ca5bff1002b6e7e8939409d961" -NEW_TENSILE_VERSION="tensile_tag develop" +OLD_TENSILE_VERSION="tensile_tag 015477ad8c8ea0ef6f59b8d49a65015b46b8a48e" +NEW_TENSILE_VERSION="tensile_tag 015477ad8c8ea0ef6f59b8d49a65015b46b8a48e" sed -i "s/${OLD_ROCBLAS_VERSION}/${NEW_ROCBLAS_VERSION}/g" CMakeLists.txt sed -i "s/${OLD_TENSILE_VERSION}/${NEW_TENSILE_VERSION}/g" CMakeLists.txt diff --git a/bump_master_version.sh b/bump_master_version.sh index 3e6324de8..5f1029d24 100755 --- a/bump_master_version.sh +++ b/bump_master_version.sh @@ -6,11 +6,11 @@ # - after running this script and merging develop into master, run bump_develop_version.sh in master and # merge master into develop -OLD_ROCBLAS_VERSION="2.5.0" -NEW_ROCBLAS_VERSION="2.6.0" +OLD_ROCBLAS_VERSION="2.7.0" +NEW_ROCBLAS_VERSION="2.8.0" -OLD_TENSILE_VERSION="tensile_tag develop" -NEW_TENSILE_VERSION="tensile_tag f5b33e22367807ca5bff1002b6e7e8939409d961" +OLD_TENSILE_VERSION="tensile_tag 9c63a0bf1c0acdb44376ddc80b867beb3386981a" +NEW_TENSILE_VERSION="tensile_tag 015477ad8c8ea0ef6f59b8d49a65015b46b8a48e" OLD_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.6.0" NEW_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.7.1" diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index 060453744..9d8ebcb94 100755 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -24,6 +24,15 @@ endif() # This project may compile dependencies for clients project( rocblas-clients LANGUAGES CXX ) +if(EXISTS /etc/redhat-release) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread") +else() + 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
+endif()
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+
 list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
 include( build-options )
 
@@ -75,3 +84,76 @@ add_custom_command( OUTPUT "${ROCBLAS_GENTEST}"
                     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" )
 
 add_custom_target( rocblas-common DEPENDS "${ROCBLAS_COMMON}" "${ROCBLAS_TEMPLATE}" "${ROCBLAS_GENTEST}" )
+
+
+# TODO: move to rocm-cmake
+include(CMakeParseArguments)
+
+# rocm_create_package_clients(LIB_NAME name VERSION version
+#                             [DESCRIPTION text] [SECTION section]
+#                             [MAINTAINER contact] [DEPENDS relation...])
+#
+# Stages a Debian package called "name-clients" under
+# ${PROJECT_BINARY_DIR}/package and defines the package_clients target that
+# builds it with dpkg -b.
+#
+# Configure time: an existing ${PROJECT_BINARY_DIR}/package tree is deleted,
+# opt/rocm/name/bin is recreated inside it and DEBIAN/control is written.
+# Build time (package_clients): the staged bin directory is emptied, the
+# contents of ${PROJECT_BINARY_DIR}/staging are copied into it and
+# name-clients-version-Linux.deb is built.
+#
+# The package always depends on "name (>= version)"; every DEPENDS entry is
+# appended as a further relation, e.g. DEPENDS "libfoo (>= 1.2)".
+function(rocm_create_package_clients)
+    set(options)
+    set(oneValueArgs LIB_NAME DESCRIPTION SECTION MAINTAINER VERSION)
+    set(multiValueArgs DEPENDS)
+
+    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    set(package_root "${PROJECT_BINARY_DIR}/package")
+    set(package_bin_dir "${package_root}/opt/rocm/${PARSE_LIB_NAME}/bin")
+    set(package_file "${PARSE_LIB_NAME}-clients-${PARSE_VERSION}-Linux.deb")
+
+    # Debian control syntax: relations are comma-separated and a versioned
+    # relation is written "name (>= version)".
+    set(package_depends "${PARSE_LIB_NAME} (>= ${PARSE_VERSION})" ${PARSE_DEPENDS})
+    string(REPLACE ";" ", " package_depends "${package_depends}")
+
+    string(CONCAT control_content
+        "Package: ${PARSE_LIB_NAME}-clients"
+        "\nVersion: ${PARSE_VERSION}"
+        "\nSection: ${PARSE_SECTION}"
+        "\nPriority: optional"
+        "\nArchitecture: amd64"
+        "\nMaintainer: ${PARSE_MAINTAINER}"
+        "\nDescription: ${PARSE_DESCRIPTION}"
+        "\nDepends: ${package_depends}\n\n")
+
+    # Start from an empty tree so files staged by an earlier configure run are
+    # not carried into the package.
+    if(EXISTS "${package_root}")
+        file(REMOVE_RECURSE "${package_root}")
+    endif()
+    file(MAKE_DIRECTORY "${package_bin_dir}")
+    file(WRITE "${package_root}/DEBIAN/control" "${control_content}")
+
+    # NOTE(review): the "*" patterns below are presumably expanded by the shell
+    # the generator runs these commands through, not by cmake -E itself; for
+    # that reason VERBATIM is deliberately not added here - confirm before changing.
+    add_custom_target(package_clients
+        COMMAND ${CMAKE_COMMAND} -E remove -f "${package_bin_dir}/*"
+        COMMAND ${CMAKE_COMMAND} -E copy "${PROJECT_BINARY_DIR}/staging/*" "${package_bin_dir}"
+        COMMAND dpkg -b "${package_root}/" ${package_file})
+endfunction()
+
+
+if (BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_SELFTEST OR BUILD_CLIENTS_RIDER)
+
+rocm_create_package_clients(LIB_NAME rocblas + DESCRIPTION "Radeon Open Compute BLAS library" + MAINTAINER "rocblas-maintainer@amd.com>" + SECTION "dev" + VERSION ${rocblas_VERSION}) +endif() diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index 1b175009e..b27746611 100644 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -8,6 +8,12 @@ set( Boost_DETAILED_FAILURE_MSG ON ) set( Boost_ADDITIONAL_VERSIONS 1.65.1 1.65 ) set( Boost_USE_STATIC_LIBS OFF ) +if(EXISTS /etc/redhat-release) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") +endif() + find_package( Boost COMPONENTS program_options ) if( NOT Boost_FOUND ) @@ -20,6 +26,9 @@ if( NOT Boost_FOUND ) endif( ) endif( ) +set( THREADS_PREFER_PTHREAD_FLAG ON ) +find_package( Threads REQUIRED ) + # Linking lapack library requires fortran flags enable_language( Fortran ) find_package( cblas CONFIG REQUIRED ) @@ -27,15 +36,19 @@ if( NOT cblas_FOUND ) message( FATAL_ERROR "cblas is a required dependency and is not found; try adding cblas path to CMAKE_PREFIX_PATH" ) endif( ) +if(LINK_BLIS) + set( BLIS_CPP ../common/blis_interface.cpp ) +endif() + set( rocblas_benchmark_common ../common/utility.cpp ../common/cblas_interface.cpp - ../common/norm.cpp + ${BLIS_CPP} ../common/rocblas_parse_data.cpp ) add_executable( rocblas-bench client.cpp ${rocblas_benchmark_common} ) -target_compile_features( rocblas-bench PRIVATE cxx_static_assert cxx_nullptr cxx_auto_type ) +target_compile_features( rocblas-bench PRIVATE cxx_static_assert cxx_nullptr cxx_auto_type) if( BUILD_WITH_TENSILE ) target_compile_definitions( rocblas-bench PRIVATE BUILD_WITH_TENSILE=1 ) @@ -49,16 +62,49 @@ target_include_directories( rocblas-bench $ ) -# External header includes included as system files -target_include_directories( rocblas-bench - SYSTEM PRIVATE - $ - $ - $ - $ +set( BLIS_INCLUDE_DIR 
${CMAKE_SOURCE_DIR}/build/deps/blis/include/blis ) +set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) + +if( EXISTS /etc/redhat-release) + set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) + set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + + # External header includes included as system files + target_include_directories( rocblas-bench + SYSTEM PRIVATE + $ + $ + $ + $ + $ + $ + $ + ) + if(LINK_BLIS) + target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} ${BLIS_LIBRARY} ${OPENMP_LIBRARY} cblas lapack roc::rocblas ) + else() + target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} ${OPENMP_LIBRARY} cblas lapack roc::rocblas ) + endif() + +else() + # External header includes included as system files + target_include_directories( rocblas-bench + SYSTEM PRIVATE + $ + $ + $ + $ + $ ) + + if(LINK_BLIS) + target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} ${BLIS_LIBRARY} cblas lapack roc::rocblas ) + else() + target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} cblas lapack roc::rocblas ) + endif() -target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} cblas lapack roc::rocblas ) +endif() get_target_property( HIPHCC_LOCATION hip::hip_hcc IMPORTED_LOCATION_RELEASE ) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 4ff1679e5..944ff95df 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -13,9 +13,15 @@ #include "testing_dot.hpp" #include "testing_geam.hpp" #include "testing_gemv.hpp" +#include "testing_gemv_batched.hpp" +#include "testing_gemv_strided_batched.hpp" #include "testing_ger.hpp" #include "testing_iamax_iamin.hpp" #include "testing_nrm2.hpp" +#include "testing_rot.hpp" +#include "testing_rotg.hpp" +#include "testing_rotm.hpp" +#include "testing_rotmg.hpp" #include 
"testing_scal.hpp" #include "testing_set_get_matrix.hpp" #include "testing_set_get_vector.hpp" @@ -48,7 +54,7 @@ using namespace std::literals; #include "testing_trsv.hpp" // Template to dispatch testing_gemm_ex for performance tests -// When Ti == void or complex, the test is marked invalid +// When Ti == void or Ti == To == Tc == bfloat16, the test is marked invalid template struct perf_gemm_ex : rocblas_test_invalid { @@ -58,7 +64,9 @@ template struct perf_gemm_ex{} && !is_complex>::type> + typename std::enable_if{} + && !(std::is_same{} && std::is_same{} + && std::is_same{})>::type> { explicit operator bool() { @@ -71,7 +79,7 @@ struct perf_gemm_ex struct perf_gemm_strided_batched_ex : rocblas_test_invalid { @@ -82,7 +90,9 @@ struct perf_gemm_strided_batched_ex< Ti, To, Tc, - typename std::enable_if{} && !is_complex>::type> + typename std::enable_if{} + && !(std::is_same{} && std::is_same{} + && std::is_same{})>::type> { explicit operator bool() { @@ -141,6 +151,10 @@ struct perf_blas< testing_nrm2(arg); else if(!strcmp(arg.function, "gemv")) testing_gemv(arg); + else if(!strcmp(arg.function, "gemv_batched")) + testing_gemv_batched(arg); + else if(!strcmp(arg.function, "gemv_strided_batched")) + testing_gemv_strided_batched(arg); else if(!strcmp(arg.function, "ger")) testing_ger(arg); else if(!strcmp(arg.function, "syr")) @@ -155,6 +169,31 @@ struct perf_blas< testing_set_get_vector(arg); else if(!strcmp(arg.function, "set_get_matrix")) testing_set_get_matrix(arg); + else if(!strcmp(arg.function, "rot")) + testing_rot(arg); + else if(!strcmp(arg.function, "rotg")) + testing_rotg(arg); + else if(!strcmp(arg.function, "rotm")) + testing_rotm(arg); + else if(!strcmp(arg.function, "rotmg")) + testing_rotmg(arg); + else + throw std::invalid_argument("Invalid combination --function "s + arg.function + + " --a_type "s + rocblas_datatype2string(arg.a_type)); + } +}; + +template +struct perf_blas{}>::type> +{ + explicit operator bool() + { + return true; + } + void 
operator()(const Arguments& arg) + { + if(!strcmp(arg.function, "dot")) + testing_dot(arg); else throw std::invalid_argument("Invalid combination --function "s + arg.function + " --a_type "s + rocblas_datatype2string(arg.a_type)); @@ -172,6 +211,8 @@ struct perf_blas{}>: { if(!strcmp(arg.function, "axpy")) testing_axpy(arg); + else if(!strcmp(arg.function, "dot")) + testing_dot(arg); else if(!strcmp(arg.function, "gemm")) testing_gemm(arg); else if(!strcmp(arg.function, "gemm_strided_batched")) @@ -194,7 +235,11 @@ struct perf_blas(arg); + else if(!strcmp(arg.function, "gemm_strided_batched")) + testing_gemm_strided_batched(arg); + else if(!strcmp(arg.function, "asum")) testing_asum(arg); else if(!strcmp(arg.function, "axpy")) testing_axpy(arg); @@ -500,6 +545,16 @@ try "Specific stride of strided_batched matrix D, is only applicable to strided batched" "BLAS_EX: second dimension * leading dimension.") + ("stride_x", + value(&arg.stride_x)->default_value(128*128), + "Specific stride of strided_batched vector x, is only applicable to strided batched" + "BLAS_2: second dimension.") + + ("stride_y", + value(&arg.stride_y)->default_value(128*128), + "Specific stride of strided_batched vector y, is only applicable to strided batched" + "BLAS_2: leading dimension.") + ("incx", value(&arg.incx)->default_value(1), "increment between values in x vector") @@ -518,7 +573,7 @@ try value(&arg.beta)->default_value(0.0), "specifies the scalar beta") ("betai", - value(&arg.beta)->default_value(0.0), "specifies the imaginary part of the scalar beta") + value(&arg.betai)->default_value(0.0), "specifies the imaginary part of the scalar beta") ("function,f", value(&function), diff --git a/clients/common/blis_interface.cpp b/clients/common/blis_interface.cpp new file mode 100644 index 000000000..2f279f6db --- /dev/null +++ b/clients/common/blis_interface.cpp @@ -0,0 +1,9 @@ +#include "blis.h" +#include "omp.h" + +void setup_blis() +{ + bli_init(); +} + +static int initialize_blis = 
(setup_blis(), 0); diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp index 86a7fa3b7..74eb44ceb 100644 --- a/clients/common/cblas_interface.cpp +++ b/clients/common/cblas_interface.cpp @@ -39,6 +39,50 @@ void cblas_axpy(rocblas_int n, } } +template <> +void cblas_dot(rocblas_int n, + const rocblas_half* x, + rocblas_int incx, + const rocblas_half* y, + rocblas_int incy, + rocblas_half* result) +{ + size_t abs_incx = incx >= 0 ? incx : -incx; + size_t abs_incy = incy >= 0 ? incy : -incy; + host_vector x_float(n * abs_incx); + host_vector y_float(n * abs_incy); + + for(size_t i = 0; i < n; i++) + { + x_float[i * abs_incx] = half_to_float(x[i * abs_incx]); + y_float[i * abs_incy] = half_to_float(y[i * abs_incy]); + } + + *result = float_to_half(cblas_sdot(n, x_float, incx, y_float, incy)); +} + +template <> +void cblas_dot(rocblas_int n, + const rocblas_bfloat16* x, + rocblas_int incx, + const rocblas_bfloat16* y, + rocblas_int incy, + rocblas_bfloat16* result) +{ + size_t abs_incx = incx >= 0 ? incx : -incx; + size_t abs_incy = incy >= 0 ? incy : -incy; + host_vector x_float(n * abs_incx); + host_vector y_float(n * abs_incy); + + for(size_t i = 0; i < n; i++) + { + x_float[i * abs_incx] = float(x[i * abs_incx]); + y_float[i * abs_incy] = float(y[i * abs_incy]); + } + + *result = rocblas_bfloat16(cblas_sdot(n, x_float, incx, y_float, incy)); +} + /* * =========================================================================== * level 2 BLAS diff --git a/clients/common/norm.cpp b/clients/common/norm.cpp deleted file mode 100644 index b350e6c2d..000000000 --- a/clients/common/norm.cpp +++ /dev/null @@ -1,596 +0,0 @@ -/* ************************************************************************ - * Copyright 2018-2019 Advanced Micro Devices, Inc. 
- * - * ************************************************************************ */ - -#include "norm.hpp" -#include "cblas.h" -#include "rocblas.h" -#include "rocblas_vector.hpp" -#include "utility.hpp" -#include -#include -#include - -/* ===================================================================== - README: Norm check: norm(A-B)/norm(A), evaluate relative error - Numerically, it is recommended by lapack. - - Call lapack fortran routines that do not exsit in cblas library. - No special header is required. But need to declare - function prototype - - All the functions are fortran and should append underscore (_) while - declaring prototype and calling. - xlange and xaxpy prototype are like following - =================================================================== */ - -extern "C" { -float slange_(char* norm_type, int* m, int* n, float* A, int* lda, float* work); -double dlange_(char* norm_type, int* m, int* n, double* A, int* lda, double* work); -float clange_(char* norm_type, int* m, int* n, rocblas_float_complex* A, int* lda, float* work); -double zlange_(char* norm_type, int* m, int* n, rocblas_double_complex* A, int* lda, double* work); - -float slansy_(char* norm_type, char* uplo, int* n, float* A, int* lda, float* work); -double dlansy_(char* norm_type, char* uplo, int* n, double* A, int* lda, double* work); -float clanhe_(char* norm_type, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* work); -double - zlanhe_(char* norm_type, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* work); - -void saxpy_(int* n, float* alpha, float* x, int* incx, float* y, int* incy); -void daxpy_(int* n, double* alpha, double* x, int* incx, double* y, int* incy); -void caxpy_( - int* n, float* alpha, rocblas_float_complex* x, int* incx, rocblas_float_complex* y, int* incy); -void zaxpy_(int* n, - double* alpha, - rocblas_double_complex* x, - int* incx, - rocblas_double_complex* y, - int* incy); -} - -/* ============================Norm 
Check for General Matrix: float/double/complex template - * speciliazation ======================================= */ - -/*! \brief compare the norm error of two matrices hCPU & hGPU */ -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_bfloat16* hCPU, - rocblas_bfloat16* hGPU) -{ - // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - - double error_double = std::numeric_limits::quiet_NaN(); - - host_vector hCPU_float(N * lda), hGPU_float(N * lda); - for(rocblas_int i = 0; i < N * lda; i++) - { - hCPU_float[i] = float(hCPU[i]); - hGPU_float[i] = float(hGPU[i]); - } - - float work; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - float cpu_norm = slange_(&norm_type, &M, &N, hCPU_float, &lda, &work); - saxpy_(&size, &alpha, hCPU_float, &incx, hGPU_float, &incx); - - float error_float = slange_(&norm_type, &M, &N, hGPU_float, &lda, &work) / cpu_norm; - error_double = double(error_float); - - return error_double; -} - -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_half* hCPU, - rocblas_half* hGPU) -{ - // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - - double error_double = std::numeric_limits::quiet_NaN(); - - host_vector hCPU_float(N * lda), hGPU_float(N * lda); - for(rocblas_int i = 0; i < N * lda; i++) - { - hCPU_float[i] = half_to_float(hCPU[i]); - hGPU_float[i] = half_to_float(hGPU[i]); - } - - float work; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - float cpu_norm = slange_(&norm_type, &M, &N, hCPU_float, &lda, &work); - saxpy_(&size, &alpha, hCPU_float, &incx, 
hGPU_float, &incx); - - float error_float = slange_(&norm_type, &M, &N, hGPU_float, &lda, &work) / cpu_norm; - error_double = double(error_float); - - return error_double; -} - -template <> -double norm_check_general( - char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, float* hCPU, float* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - - float work; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - float cpu_norm = slange_(&norm_type, &M, &N, hCPU, &lda, &work); - saxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - float error = slange_(&norm_type, &M, &N, hGPU, &lda, &work) / cpu_norm; - - return (double)error; -} - -template <> -double norm_check_general( - char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, double* hCPU, double* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - - double work[1]; - rocblas_int incx = 1; - double alpha = -1.0; - rocblas_int size = lda * N; - - double cpu_norm = dlange_(&norm_type, &M, &N, hCPU, &lda, work); - daxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - double error = dlange_(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm; - - return error; -} - -template <> -double norm_check_general( - char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, int32_t* hCPU, int32_t* hGPU) -{ - // Upconvert int32_t to double and call double version - host_vector hCPU_double(M * N), hGPU_double(M * N); - - for(int i = 0; i < M * N; i++) - { - hCPU_double[i] = double(hCPU[i]); - hGPU_double[i] = double(hGPU[i]); - } - return norm_check_general(norm_type, M, N, lda, hCPU_double, hGPU_double); -} - -template <> -double norm_check_general(char norm_type, - rocblas_int M, 
- rocblas_int N, - rocblas_int lda, - rocblas_float_complex* hCPU, - rocblas_float_complex* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - - float work[1]; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - float cpu_norm = clange_(&norm_type, &M, &N, hCPU, &lda, work); - caxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - float error = clange_(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm; - - return (double)error; -} - -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_double_complex* hCPU, - rocblas_double_complex* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - - double work[1]; - rocblas_int incx = 1; - double alpha = -1.0; - rocblas_int size = lda * N; - - double cpu_norm = zlange_(&norm_type, &M, &N, hCPU, &lda, work); - zaxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - double error = zlange_(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm; - - return error; -} - -//=====Norm Check for strided_batched matrix -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_int stride_a, - rocblas_int batch_count, - rocblas_bfloat16* hCPU, - rocblas_bfloat16* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - // - // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm - // of strided batched matrix - - rocblas_int totalsize = N * lda + (batch_count - 1) * stride_a; - host_vector 
hCPU_float(totalsize), hGPU_float(totalsize); - for(rocblas_int i_batch = 0; i_batch < batch_count; i_batch++) - { - for(rocblas_int i = 0; i < N * lda; i++) - { - auto index = i + i_batch * stride_a; - hCPU_float[index] = float(hCPU[index]); - hGPU_float[index] = float(hGPU[index]); - } - } - - float work; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - double cumulative_error = 0.0; - - for(rocblas_int i = 0; i < batch_count; i++) - { - float cpu_norm = slange_(&norm_type, &M, &N, &hCPU_float[i * stride_a], &lda, &work); - - saxpy_(&size, &alpha, &hCPU_float[i * stride_a], &incx, &hGPU_float[i * stride_a], &incx); - - float error - = slange_(&norm_type, &M, &N, &hGPU_float[i * stride_a], &lda, &work) / cpu_norm; - - if(norm_type == 'F' || norm_type == 'f') - { - cumulative_error += error; - } - else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') - { - cumulative_error = cumulative_error > error ? cumulative_error : error; - } - } - - return cumulative_error; -} - -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_int stride_a, - rocblas_int batch_count, - rocblas_half* hCPU, - rocblas_half* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - // - // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm - // of strided batched matrix - - rocblas_int totalsize = N * lda + (batch_count - 1) * stride_a; - host_vector hCPU_float(totalsize), hGPU_float(totalsize); - for(rocblas_int i_batch = 0; i_batch < batch_count; i_batch++) - { - for(rocblas_int i = 0; i < N * lda; i++) - { - auto index = i + i_batch * stride_a; - hCPU_float[index] = half_to_float(hCPU[index]); - hGPU_float[index] = half_to_float(hGPU[index]); - } - } - - float work; - 
rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - double cumulative_error = 0.0; - - for(rocblas_int i = 0; i < batch_count; i++) - { - float cpu_norm = slange_(&norm_type, &M, &N, &hCPU_float[i * stride_a], &lda, &work); - - saxpy_(&size, &alpha, &hCPU_float[i * stride_a], &incx, &hGPU_float[i * stride_a], &incx); - - float error - = slange_(&norm_type, &M, &N, &hGPU_float[i * stride_a], &lda, &work) / cpu_norm; - - if(norm_type == 'F' || norm_type == 'f') - { - cumulative_error += error; - } - else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') - { - cumulative_error = cumulative_error > error ? cumulative_error : error; - } - } - - return cumulative_error; -} - -//=====Norm Check for strided_batched matrix -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_int stride_a, - rocblas_int batch_count, - rocblas_int* hCPU, - rocblas_int* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - // - // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm - // of strided batched matrix - - rocblas_int totalsize = N * lda + (batch_count - 1) * stride_a; - host_vector hCPU_double(totalsize), hGPU_double(totalsize); - for(rocblas_int i_batch = 0; i_batch < batch_count; i_batch++) - { - for(rocblas_int i = 0; i < N * lda; i++) - { - auto index = i + i_batch * stride_a; - hCPU_double[index] = hCPU[index]; - hGPU_double[index] = hGPU[index]; - } - } - - double work; - rocblas_int incx = 1; - double alpha = -1.0f; - rocblas_int size = lda * N; - double cumulative_error = 0.0; - - for(rocblas_int i = 0; i < batch_count; i++) - { - double cpu_norm = dlange_(&norm_type, &M, &N, &hCPU_double[i * stride_a], &lda, &work); - - daxpy_(&size, &alpha, &hCPU_double[i * 
stride_a], &incx, &hGPU_double[i * stride_a], &incx); - - double error - = dlange_(&norm_type, &M, &N, &hGPU_double[i * stride_a], &lda, &work) / cpu_norm; - - if(norm_type == 'F' || norm_type == 'f') - { - cumulative_error += error; - } - else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') - { - cumulative_error = cumulative_error > error ? cumulative_error : error; - } - } - - return cumulative_error; -} - -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_int stride_a, - rocblas_int batch_count, - float* hCPU, - float* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - // - // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm - // of strided batched matrix - - float work; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - double cumulative_error = 0.0; - - for(int i = 0; i < batch_count; i++) - { - float cpu_norm = slange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work); - - saxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx); - - float error = slange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm; - - if(norm_type == 'F' || norm_type == 'f') - { - cumulative_error += error; - } - else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') - { - cumulative_error = cumulative_error > error ? 
cumulative_error : error; - } - } - - return cumulative_error; -} - -template <> -double norm_check_general(char norm_type, - rocblas_int M, - rocblas_int N, - rocblas_int lda, - rocblas_int stride_a, - rocblas_int batch_count, - double* hCPU, - double* hGPU) -{ - // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm - // one norm is max column sum - // infinity norm is max row sum - // Frobenius is l2 norm of matrix entries - // - // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm - // of strided batched matrix - - double work; - rocblas_int incx = 1; - double alpha = -1.0f; - rocblas_int size = lda * N; - - double cumulative_error = 0.0; - - for(int i = 0; i < batch_count; i++) - { - double cpu_norm = dlange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work); - - daxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx); - - double error = dlange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm; - - if(norm_type == 'F' || norm_type == 'f') - { - cumulative_error += error; - } - else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') - { - cumulative_error = cumulative_error > error ? cumulative_error : error; - } - } - - return cumulative_error; -} - -/* ============================Norm Check for Symmetric Matrix: float/double/complex template - * speciliazation ======================================= */ - -/*! 
\brief compare the norm error of two hermitian/symmetric matrices hCPU & hGPU */ - -template <> -double norm_check_symmetric( - char norm_type, char uplo, rocblas_int N, rocblas_int lda, float* hCPU, float* hGPU) -{ - // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly - - float work[1]; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - float cpu_norm = slansy_(&norm_type, &uplo, &N, hCPU, &lda, work); - saxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - float error = slansy_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm; - - return (double)error; -} - -template <> -double norm_check_symmetric( - char norm_type, char uplo, rocblas_int N, rocblas_int lda, double* hCPU, double* hGPU) -{ - // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly - - double work[1]; - rocblas_int incx = 1; - double alpha = -1.0; - rocblas_int size = lda * N; - - double cpu_norm = dlansy_(&norm_type, &uplo, &N, hCPU, &lda, work); - daxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - double error = dlansy_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm; - - return error; -} - -template <> -double norm_check_symmetric(char norm_type, - char uplo, - rocblas_int N, - rocblas_int lda, - rocblas_float_complex* hCPU, - rocblas_float_complex* hGPU) -{ - // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly - - float work[1]; - rocblas_int incx = 1; - float alpha = -1.0f; - rocblas_int size = lda * N; - - float cpu_norm = clanhe_(&norm_type, &uplo, &N, hCPU, &lda, work); - caxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - float error = clanhe_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm; - - return (double)error; -} - -template <> -double norm_check_symmetric(char norm_type, - char uplo, - rocblas_int N, - rocblas_int lda, - rocblas_double_complex* hCPU, - rocblas_double_complex* hGPU) -{ - // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly - - double 
work[1]; - rocblas_int incx = 1; - double alpha = -1.0; - rocblas_int size = lda * N; - - double cpu_norm = zlanhe_(&norm_type, &uplo, &N, hCPU, &lda, work); - zaxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx); - - double error = zlanhe_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm; - - return error; -} diff --git a/clients/common/rocblas_gentest.py b/clients/common/rocblas_gentest.py index e2437e28b..f2b7ab4a1 100755 --- a/clients/common/rocblas_gentest.py +++ b/clients/common/rocblas_gentest.py @@ -195,6 +195,19 @@ def setdefaults(test): # Do not put constant defaults here -- use rocblas_common.yaml for that. # These are only for dynamic defaults # TODO: This should be ideally moved to YAML file, with eval'd expressions. + + if all([x in test for x in ('M', 'incx', 'strideScale')]) and test['function']=='ger_strided_batched': + test.setdefault('stride_x', int(test['M'] * abs(test['incx']) * + test['strideScale'])) + else: + test.setdefault('stride_x', 0) + + if all([x in test for x in ('N', 'incy', 'strideScale')]) and test['function']=='ger_strided_batched': + test.setdefault('stride_y', int(test['N'] * abs(test['incy']) * + test['strideScale'])) + else: + test.setdefault('stride_y', 0) + if test['transA'] == '*' or test['transB'] == '*': test.setdefault('lda', 0) test.setdefault('ldb', 0) diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index a6c602bda..0ad325a4b 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -4,6 +4,7 @@ # For debugging, uncomment this # set( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g -O0" ) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") # set( Boost_DEBUG ON ) set( Boost_USE_MULTITHREADED ON ) @@ -61,11 +62,14 @@ set(rocblas_test_source trtri_gtest.cpp ) +if(LINK_BLIS) + set( BLIS_CPP ../common/blis_interface.cpp ) +endif() set( rocblas_benchmark_common ../common/utility.cpp ../common/cblas_interface.cpp - ../common/norm.cpp + ${BLIS_CPP} 
../common/rocblas_parse_data.cpp ) @@ -91,9 +95,53 @@ target_include_directories( rocblas-test $ $ $ + $ ) -target_link_libraries( rocblas-test PRIVATE ${GTEST_LIBRARIES} ${Boost_LIBRARIES} cblas lapack roc::rocblas Threads::Threads ) +set( BLIS_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/build/deps/blis/include/blis ) +set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) + +if( EXISTS /etc/redhat-release) + set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) + set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + + # External header includes included as system files + target_include_directories( rocblas-test + SYSTEM PRIVATE + $ + $ + $ + $ + $ + $ + $ + $ + ) + + if(LINK_BLIS) + target_link_libraries( rocblas-test PRIVATE ${GTEST_LIBRARIES} ${Boost_LIBRARIES} ${BLIS_LIBRARY} ${OPENMP_LIBRARY} cblas lapack roc::rocblas ) + else() + target_link_libraries( rocblas-test PRIVATE ${GTEST_LIBRARIES} ${Boost_LIBRARIES} ${OPENMP_LIBRARY} cblas lapack roc::rocblas ) + endif() +else() + # External header includes included as system files + target_include_directories( rocblas-test + SYSTEM PRIVATE + $ + $ + $ + $ + $ + $ + ) + + if(LINK_BLIS) + target_link_libraries( rocblas-test PRIVATE ${GTEST_LIBRARIES} ${Boost_LIBRARIES} ${BLIS_LIBRARY} cblas lapack roc::rocblas ) + else() + target_link_libraries( rocblas-test PRIVATE ${GTEST_LIBRARIES} ${Boost_LIBRARIES} cblas lapack roc::rocblas ) + endif() +endif() get_target_property( HIPHCC_LOCATION hip::hip_hcc IMPORTED_LOCATION_RELEASE ) @@ -113,7 +161,7 @@ endif( ) set( ROCBLAS_TEST_DATA "${PROJECT_BINARY_DIR}/staging/rocblas_gtest.data") add_custom_command( OUTPUT "${ROCBLAS_TEST_DATA}" COMMAND ../common/rocblas_gentest.py -I ../include rocblas_gtest.yaml -o "${ROCBLAS_TEST_DATA}" - DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml 
known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml + DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml gemv_batched_gtest.yaml gemv_strided_batched_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" ) add_custom_target( rocblas-test-data DEPENDS "${ROCBLAS_TEST_DATA}" ) diff --git a/clients/gtest/blas1_gtest.cpp b/clients/gtest/blas1_gtest.cpp index 8eb96a432..8615f8dd1 100644 --- a/clients/gtest/blas1_gtest.cpp +++ b/clients/gtest/blas1_gtest.cpp @@ -9,6 +9,10 @@ #include "testing_dot.hpp" #include "testing_iamax_iamin.hpp" #include "testing_nrm2.hpp" +#include "testing_rot.hpp" +#include "testing_rotg.hpp" +#include "testing_rotm.hpp" +#include "testing_rotmg.hpp" #include "testing_scal.hpp" #include "testing_swap.hpp" #include "type_dispatch.hpp" @@ -28,6 +32,10 @@ namespace dotc, scal, swap, + rot, + rotg, + rotm, + rotmg, }; // ---------------------------------------------------------------------------- @@ -51,19 +59,29 @@ namespace RocBLAS_TestName name; name << rocblas_datatype2string(arg.a_type); - if(BLAS1 == blas1::scal && arg.a_type != arg.b_type) - name << '_' << rocblas_datatype2string(arg.b_type); + if(strstr(arg.function, "_bad_arg") != nullptr) + { + name << "_bad_arg"; + } + else + { + if((BLAS1 == blas1::scal || BLAS1 == blas1::rot || BLAS1 == blas1::rotg) + && arg.a_type != arg.b_type) + name << '_' << 
rocblas_datatype2string(arg.b_type); + if(BLAS1 == blas1::rot && arg.compute_type != arg.a_type) + name << '_' << rocblas_datatype2string(arg.compute_type); - name << '_' << arg.N; + name << '_' << arg.N; - if(BLAS1 == blas1::axpy || BLAS1 == blas1::scal) - name << '_' << arg.alpha << "_" << arg.alphai; + if(BLAS1 == blas1::axpy || BLAS1 == blas1::scal) + name << '_' << arg.alpha << "_" << arg.alphai; - name << '_' << arg.incx; + name << '_' << arg.incx; - if(BLAS1 == blas1::axpy || BLAS1 == blas1::copy || BLAS1 == blas1::dot - || BLAS1 == blas1::swap) - name << '_' << arg.incy; + if(BLAS1 == blas1::axpy || BLAS1 == blas1::copy || BLAS1 == blas1::dot + || BLAS1 == blas1::swap || BLAS1 == blas1::rot || BLAS1 == blas1::rotm) + name << '_' << arg.incy; + } return std::move(name); } @@ -83,7 +101,8 @@ namespace || std::is_same{})) || (BLAS1 == blas1::dot && std::is_same{} && std::is_same{} - && (std::is_same{} + && (std::is_same{} || std::is_same{} + || std::is_same{} || std::is_same{} || std::is_same{} || std::is_same{})) @@ -122,7 +141,32 @@ namespace || (BLAS1 == blas1::swap && std::is_same{} && std::is_same{} && (std::is_same{} || std::is_same{} || std::is_same{} - || std::is_same{}))>; + || std::is_same{})) + + || (BLAS1 == blas1::rot + && ((std::is_same{} && std::is_same{} && std::is_same{}) + || (std::is_same{} && std::is_same{} + && std::is_same{}) + || (std::is_same{} && std::is_same{} + && std::is_same{}) + || (std::is_same{} && std::is_same{} + && std::is_same{}) + || (std::is_same{} && std::is_same{} + && std::is_same{}) + || (std::is_same{} && std::is_same{} + && std::is_same{}))) + + || (BLAS1 == blas1::rotg && std::is_same{} + && ((std::is_same{} && std::is_same{}) + || (std::is_same{} && std::is_same{}) + || (std::is_same{} && std::is_same{}) + || (std::is_same{} && std::is_same{}))) + + || (BLAS1 == blas1::rotm && std::is_same{} && std::is_same{} + && (std::is_same{} || std::is_same{})) + + || (BLAS1 == blas1::rotmg && std::is_same{} && std::is_same{} 
+ && (std::is_same{} || std::is_same{}))>; // Creates tests for one of the BLAS 1 functions // ARG passes 1-3 template arguments to the testing_* function @@ -184,6 +228,10 @@ BLAS1_TESTING(dot, ARG1) BLAS1_TESTING(dotc, ARG1) BLAS1_TESTING(scal, ARG2) BLAS1_TESTING(swap, ARG1) +BLAS1_TESTING(rot, ARG3) +BLAS1_TESTING(rotg, ARG2) +BLAS1_TESTING(rotm, ARG1) +BLAS1_TESTING(rotmg, ARG1) // clang-format on diff --git a/clients/gtest/blas1_gtest.yaml b/clients/gtest/blas1_gtest.yaml index 768cf0273..5c902dc01 100644 --- a/clients/gtest/blas1_gtest.yaml +++ b/clients/gtest/blas1_gtest.yaml @@ -34,11 +34,13 @@ Tests: # - iamin: *single_double_precisions_complex_real # broken for now -- cause unknown - axpy: *half_single_precisions_complex_real - copy: *single_double_precisions_complex_real - - dot: *single_double_precisions_complex_real + - dot: *half_bfloat_single_double_complex_real_precisions - dotc: *single_double_precisions_complex - scal: *single_double_precisions_complex_real - scal: *single_double_complex_real_in_complex_out - swap: *single_double_precisions_complex_real + - rot: *rot_precisions + - rotm: *single_double_precisions_complex_real - name: blas1 category: pre_checkin @@ -58,6 +60,8 @@ Tests: - scal: *single_double_precisions_complex_real - scal: *single_double_complex_real_in_complex_out - swap: *single_double_precisions_complex_real + - rot: *rot_precisions + - rotm: *single_double_precisions_complex_real - name: blas1_bad_arg category: pre_checkin @@ -68,9 +72,19 @@ Tests: - iamin_bad_arg: *single_double_precisions_complex_real - axpy_bad_arg: *half_single_precisions_complex_real - copy_bad_arg: *single_double_precisions_complex_real - - dot_bad_arg: *single_double_precisions_complex_real + - dot_bad_arg: *half_bfloat_single_double_complex_real_precisions - dotc_bad_arg: *single_double_precisions_complex - scal_bad_arg: *single_double_precisions_complex_real - scal_bad_arg: *single_double_complex_real_in_complex_out - swap_bad_arg: 
*single_double_precisions_complex_real + - rot_bad_arg: *rot_precisions + - rotg_bad_arg: *rotg_precisions + - rotm_bad_arg: *single_double_precisions_complex_real + - rotmg_bad_arg: *single_double_precisions_complex_real + +- name: blas1 + category: quick + function: + - rotg: *rotg_precisions + - rotmg: *single_double_precisions_complex_real ... diff --git a/clients/gtest/gemm_gtest.cpp b/clients/gtest/gemm_gtest.cpp index 5f6a6841f..5ad7f5390 100644 --- a/clients/gtest/gemm_gtest.cpp +++ b/clients/gtest/gemm_gtest.cpp @@ -112,12 +112,12 @@ namespace // When Ti = To = Tc != void, this test applies. // When converted to bool, this functor returns true. - // Complex is not supported yet. template struct gemm_testing{} && !is_complex>::type> + typename std::enable_if{} + && !std::is_same{}>::type> { explicit operator bool() { @@ -165,13 +165,14 @@ namespace // When Ti != void, this test applies. // When converted to bool, this functor returns true. - // Complex is not supported yet. template struct gemm_ex_testing< Ti, To, Tc, - typename std::enable_if{} && !is_complex>::type> + typename std::enable_if{} + && !(std::is_same{} && std::is_same{} + && std::is_same{})>::type> { explicit operator bool() { diff --git a/clients/gtest/gemm_gtest.yaml b/clients/gtest/gemm_gtest.yaml index bebd347b3..6b88c1018 100644 --- a/clients/gtest/gemm_gtest.yaml +++ b/clients/gtest/gemm_gtest.yaml @@ -155,6 +155,16 @@ Definitions: - { alpha: 1, beta: 3 } - { alpha: 1, beta: 1 } + - &complex_alpha_beta_range + - { alpha: 2, beta: 0, alphai: 0, betai: 0 } + - { alpha: 0, beta: 3, alphai: 0, betai: 0 } + - { alpha: 5, beta: 0, alphai: 0, betai: 5 } + - { alpha: -5, beta: 0, alphai: -5, betai: 0 } + - { alpha: 0, beta: 5, alphai: 0, betai: -5 } + - { alpha: 1, beta: 3, alphai: 3, betai: 1 } + - { alpha: -1, beta: -3, alphai: 3, betai: 1 } + - { alpha: 0, beta: 0, alphai: 2, betai: 1 } + - &transA_transB_range - { transA: N, transB: N } - { transA: N, transB: T } @@ -2345,15 +2355,27 @@ 
Tests: transA: N transB: N +- name: gemm_bad_arg + category: pre_checkin + function: + - gemm_bad_arg + - gemm_ex_bad_arg + - gemm_strided_batched_ex_bad_arg + precision: *single_double_precisions_complex + transA: N + transB: N + - name: gemm_NaN category: pre_checkin function: - gemm: *single_double_precisions # Half precision NaN doesn't work now - gemm_ex: *single_double_precisions + gemm: *single_double_precisions_complex_real # Half precision NaN doesn't work now + gemm_ex: *single_double_precisions_complex_real matrix_size: *medium_matrix_size_range transA_transB: *transA_transB_range alpha: [ 0.0, 1.0, -1.0, 2.0 ] + alphai: [ -1.0, 0.0, 1.0] beta: .NaN # converted to 0.0 in test code + betai: .NaN - name: gemm_small category: quick @@ -2364,6 +2386,15 @@ Tests: transA_transB: *transA_transB_range alpha_beta: *alpha_beta_range +- name: gemm_small_complex + category: quick + function: + gemm: *single_double_precisions_complex + gemm_ex: *single_double_precisions_complex + matrix_size: *small_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *complex_alpha_beta_range + - name: gemm_medium category: pre_checkin function: @@ -2373,6 +2404,15 @@ Tests: transA_transB: *transA_transB_range alpha_beta: *alpha_beta_range +- name: gemm_medium_complex + category: pre_checkin + function: + gemm: *single_double_precisions_complex + gemm_ex: *single_double_precisions_complex + matrix_size: *medium_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *alpha_beta_range + - name: gemm_large category: nightly function: @@ -2382,6 +2422,15 @@ Tests: transA_transB: *transA_transB_range alpha_beta: *alpha_beta_range +- name: gemm_large + category: nightly + function: + gemm: *single_double_precisions_complex + gemm_ex: *single_double_precisions_complex + matrix_size: *large_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *alpha_beta_range + - name: gemm_chunk category: pre_checkin function: @@ -2446,7 +2495,7 @@ Tests: - 
name: inception4_fwd category: nightly function: - gemmn: *half_single_double_precisions + gemm: *half_single_double_precisions transA: N transB: N matrix_size: *inception4_fwd_sizes diff --git a/clients/gtest/gemm_strided_batched_gtest.yaml b/clients/gtest/gemm_strided_batched_gtest.yaml index 08807c583..5032e88e4 100644 --- a/clients/gtest/gemm_strided_batched_gtest.yaml +++ b/clients/gtest/gemm_strided_batched_gtest.yaml @@ -223,6 +223,16 @@ Definitions: - { alpha: -2.0, beta: -3.0 } - { alpha: 0.0, beta: 1.0 } + - &complex_alpha_beta_range + - { alpha: 2, beta: 0, alphai: 0, betai: 0 } + - { alpha: 0, beta: 3, alphai: 0, betai: 0 } + - { alpha: 5, beta: 0, alphai: 0, betai: 5 } + - { alpha: -5, beta: 0, alphai: -5, betai: 0 } + - { alpha: 0, beta: 5, alphai: 0, betai: -5 } + - { alpha: 1, beta: 3, alphai: 3, betai: 1 } + - { alpha: -1, beta: -3, alphai: 3, betai: 1 } + - { alpha: 0, beta: 0, alphai: 2, betai: 1 } + Tests: - name: gemm_strided_batched_bad_arg category: pre_checkin @@ -231,15 +241,24 @@ Tests: transA: N transB: N +- name: gemm_strided_batched_bad_arg + category: pre_checkin + function: + - gemm_strided_batched_ex_bad_arg: *single_double_precisions_complex + transA: N + transB: N + - name: gemm_strided_batched_NaN category: pre_checkin function: - - gemm_strided_batched: *single_double_precisions - - gemm_strided_batched_ex: *single_double_precisions + - gemm_strided_batched: *single_double_precisions_complex_real + - gemm_strided_batched_ex: *single_double_precisions_complex_real matrix_size: *small_matrix_size_range transA_transB: *transA_transB_range alpha: [ -1.0, 0.0, 1.0, 2.0 ] + alphai: [ -1.0, 0.0, 1.0 ] beta: .NaN # converted to 0.0 in test code + betai: .NaN batch_count: [ 1, 3 ] # TODO: Add int8 precisions by replacing *hpa_half_single_double_precisions with *real_precisions @@ -254,6 +273,16 @@ Tests: transA_transB: *transA_transB_range batch_count: [ -1, 0, 1, 3 ] +- name: gemm_strided_batched_small_complex + category: quick + 
function: + gemm_strided_batched: *single_double_precisions_complex + gemm_strided_batched_ex: *single_double_precisions_complex + matrix_size: *small_matrix_size_range + alpha_beta: *complex_alpha_beta_range + transA_transB: *transA_transB_range + batch_count: [ -1, 0, 1, 3 ] + - name: gemm_strided_batched_small_stride_zero category: quick function: @@ -261,7 +290,23 @@ Tests: gemm_strided_batched_ex: *real_precisions matrix_size: *small_matrix_size_stride_a_range alpha: 2.0 + alphai: 1.0 beta: 3.0 + betai: -1.0 + transA: N + transB: N + batch_count: [ 1, 3 ] + +- name: gemm_strided_batched_small_stride_zero_complex + category: quick + function: + gemm_strided_batched: *single_double_precisions_complex + gemm_strided_batched_ex: *single_double_precisions_complex + matrix_size: *small_matrix_size_stride_a_range + alpha: 2.0 + alphai: 1.0 + beta: 3.0 + betai: -1.0 transA: N transB: N batch_count: [ 1, 3 ] @@ -276,6 +321,16 @@ Tests: alpha_beta: *alpha_beta_range batch_count: [ -1, 0, 1, 3, 63..65 ] +- name: gemm_strided_batched_medium_complex + category: pre_checkin + function: + gemm_strided_batched: *single_double_precisions_complex + gemm_strided_batched_ex: *single_double_precisions_complex + matrix_size: *medium_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *complex_alpha_beta_range + batch_count: [ -1, 0, 1, 3, 63..65 ] + - name: gemm_strided_batched_medium_stride_zero category: nightly function: @@ -283,7 +338,23 @@ Tests: gemm_strided_batched_ex: *hpa_half_single_precisions matrix_size: *medium_matrix_size_stride_a_range alpha: 2.0 + alphai: 1.0 beta: 3.0 + betai: -1.0 + transA: N + transB: N + batch_count: 31..33 + +- name: gemm_strided_batched_medium_stride_zero_complex + category: nightly + function: + gemm_strided_batched: *single_double_precisions_complex + gemm_strided_batched_ex: *single_double_precisions_complex + matrix_size: *medium_matrix_size_stride_a_range + alpha: 2.0 + alphai: 1.0 + beta: 3.0 + betai: -1.0 transA: N 
transB: N batch_count: 31..33 @@ -298,6 +369,16 @@ Tests: alpha_beta: *alpha_beta_range batch_count: [ -1, 0, 1, 3 ] +- name: gemm_strided_batched_large_complex + category: pre_checkin + function: + gemm_strided_batched: *single_double_precisions_complex + gemm_strided_batched_ex: *single_double_precisions_complex + matrix_size: *large_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *alpha_beta_range + batch_count: [ -1, 0, 1, 3 ] + - name: gemm_strided_batched_large_stride_zero category: pre_checkin function: @@ -305,7 +386,23 @@ Tests: gemm_strided_batched_ex: *real_precisions matrix_size: *large_matrix_size_stride_a_range alpha: 2.0 + alphai: 1.0 + beta: 3.0 + betai: -1.0 + transA: N + transB: N + batch_count: [ -1, 0, 1, 3 ] + +- name: gemm_strided_batched_large_stride_zero_complex + category: pre_checkin + function: + gemm_strided_batched: *single_double_precisions_complex + gemm_strided_batched_ex: *single_double_precisions_complex + matrix_size: *large_matrix_size_stride_a_range + alpha: 2.0 + alphai: 1.0 beta: 3.0 + betai: -1.0 transA: N transB: N batch_count: [ -1, 0, 1, 3 ] diff --git a/clients/gtest/gemv_batched_gtest.yaml b/clients/gtest/gemv_batched_gtest.yaml new file mode 100644 index 000000000..e0eeedd3d --- /dev/null +++ b/clients/gtest/gemv_batched_gtest.yaml @@ -0,0 +1,91 @@ +--- +include: rocblas_common.yaml +include: known_bugs.yaml + +Definitions: + - &small_matrix_size_range + - { M: -1, N: 1, lda: 1 } + - { M: 1, N: -1, lda: 1 } + - { M: 1, N: 1, lda: 0 } + - { M: 10, N: 10, lda: 9 } + - { M: 0, N: 1, lda: 1 } + - { M: 1, N: 0, lda: 1 } + - { M: -1, N: -1, lda: -1 } + - { M: 10, N: 10, lda: 2 } + - { M: 100, N: 200, lda: 200 } + + - &medium_matrix_size_range + - { M: 300, N: 400, lda: 400 } + - { M: 600, N: 500, lda: 601 } + + - &large_matrix_size_range + - { M: 1000, N: 1000, lda: 1000 } + - { M: 2000, N: 2000, lda: 2000 } + - { M: 4011, N: 4011, lda: 4011 } + - { M: 8000, N: 8000, lda: 8000 } + + - &incx_incy_range + 
- { incx: 2, incy: 1 } + - { incx: -1, incy: 2 } + - { incx: 1, incy: 1 } + - { incx: -1, incy: 3 } + - { incx: 3, incy: -1 } + - { incx: 0, incy: 1 } + - { incx: 1, incy: 0 } + - { incx: 0, incy: -1 } + - { incx: 10, incy: 100 } + + - &alpha_beta_range + - { alpha: 2.0, beta: 0.0 } + - { alpha: -1.0, beta: -1.0 } + - { alpha: 2.0, beta: 1.0 } + - { alpha: 0.0, beta: 1.0 } + +Tests: +- name: gemv_batched_bad_arg + category: pre_checkin + function: gemv_batched_bad_arg + precision: *single_double_precisions + transA: N + +- name: gemv_batched_NaN + category: pre_checkin + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha: [ -1.0, 0, 1.0, 2.0 ] + beta: .NaN # converted to 0.0 in test code + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_batched_small + category: quick + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *small_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_batched_medium + category: pre_checkin + function: gemv_batched + precision: *single_double_precisions_complex_real + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] + +- name: gemv_batched_large + category: nightly + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *large_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] +... 
diff --git a/clients/gtest/gemv_gtest.cpp b/clients/gtest/gemv_gtest.cpp index ff86b426d..fd9823333 100644 --- a/clients/gtest/gemv_gtest.cpp +++ b/clients/gtest/gemv_gtest.cpp @@ -6,6 +6,8 @@ #include "rocblas_datatype2string.hpp" #include "rocblas_test.hpp" #include "testing_gemv.hpp" +#include "testing_gemv_batched.hpp" +#include "testing_gemv_strided_batched.hpp" #include "type_dispatch.hpp" #include #include @@ -13,6 +15,69 @@ namespace { + // possible gemv test cases + enum gemv_test_type + { + GEMV, + GEMV_BATCHED, + GEMV_STRIDED_BATCHED, + }; + + //gemv test template + template