From 3200b870f5e19bdb3a113647f52f7917bc652da7 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Wed, 2 Oct 2019 17:55:25 -0600 Subject: [PATCH 01/13] unified gemv template --- clients/common/rocblas_gentest.py | 16 + clients/gtest/gemv_strided_batched_gtest.yaml | 22 +- clients/gtest/ger_gtest.cpp | 32 +- clients/gtest/ger_gtest.yaml | 25 +- .../include/testing_gemv_strided_batched.hpp | 3 +- .../include/testing_ger_strided_batched.hpp | 3 +- library/src/blas2/gemv_device.hpp | 211 +++------ library/src/blas2/rocblas_gemv.cpp | 2 +- library/src/blas2/rocblas_gemv.hpp | 429 +++--------------- library/src/blas2/rocblas_gemv_batched.cpp | 4 +- .../blas2/rocblas_gemv_strided_batched.cpp | 30 +- library/src/blas2/rocblas_ger.cpp | 6 +- library/src/blas2/rocblas_ger_batched.cpp | 6 +- library/src/blas2/rocblas_ger_batched.hpp | 108 ----- .../src/blas2/rocblas_ger_strided_batched.cpp | 8 +- .../src/blas2/rocblas_ger_strided_batched.hpp | 121 ----- library/src/include/utility.h | 16 + 17 files changed, 198 insertions(+), 844 deletions(-) delete mode 100644 library/src/blas2/rocblas_ger_batched.hpp delete mode 100644 library/src/blas2/rocblas_ger_strided_batched.hpp diff --git a/clients/common/rocblas_gentest.py b/clients/common/rocblas_gentest.py index cffa95f1f..fe51d2211 100755 --- a/clients/common/rocblas_gentest.py +++ b/clients/common/rocblas_gentest.py @@ -214,6 +214,22 @@ def setdefaults(test): test.setdefault('stride_y', int(test['N'] * abs(test['incy']) * test['stride_scale'])) + if test['function'] in ('gemv_strided_batched'): + if test['transA'] == 'N': + if all([x in test for x in ('N', 'incx', 'stride_scale')]): + test.setdefault('stride_x', int(test['N'] * abs(test['incx']) * + test['stride_scale'])) + if all([x in test for x in ('M', 'incy', 'stride_scale')]): + test.setdefault('stride_y', int(test['M'] * abs(test['incy']) * + test['stride_scale'])) + else: + if all([x in test for x in ('M', 'incx', 'stride_scale')]): + test.setdefault('stride_x', int(test['M'] * abs(test['incx']) * + test['stride_scale'])) + if all([x in test for x in ('N', 'incy', 'stride_scale')]): + test.setdefault('stride_y', int(test['N'] * abs(test['incy']) * + test['stride_scale'])) + test.setdefault('stride_x', 0) test.setdefault('stride_y', 0) diff --git a/clients/gtest/gemv_strided_batched_gtest.yaml b/clients/gtest/gemv_strided_batched_gtest.yaml index 7a4ddbd2e..46a14601d 100644 --- a/clients/gtest/gemv_strided_batched_gtest.yaml +++ b/clients/gtest/gemv_strided_batched_gtest.yaml @@ -25,15 +25,15 @@ Definitions: - { M: 8000, N: 8000, lda: 8000, stride_a: 64000000 } - &incx_incy_range - - { incx: 2, incy: 1, stride_x: 8000, stride_y: 8000 } - - { incx: -1, incy: 2, stride_x: 8000, stride_y: 8000 } - - { incx: 1, incy: 1, stride_x: 8000, stride_y: 8000 } - - { incx: -1, incy: 3, stride_x: 4000, stride_y: 4000 } - - { incx: 3, incy: -1, stride_x: 2000, stride_y: 2000 } - - { incx: 0, incy: 1, stride_x: 1000, stride_y: 1000 } - - { incx: 1, incy: 0, stride_x: 1000, stride_y: 1000 } - - { incx: 0, incy: -1, stride_x: 1, stride_y: 1 } - - { incx: 10, incy: 100, stride_x: 8000, stride_y: 8000 } + - { incx: 2, incy: 1 } + - { incx: -1, incy: 2 } + - { incx: 1, incy: 1 } + - { incx: -1, incy: 3 } + - { incx: 3, incy: -1 } + - { incx: 0, incy: 1 } + - { incx: 1, incy: 0 } + - { incx: 0, incy: -1 } + - { incx: 10, incy: 100 } - &alpha_beta_range - { alpha: 2.0, beta: 0.0 } @@ -57,6 +57,7 @@ Tests: incx_incy: *incx_incy_range alpha: [ -1.0, 0, 1.0, 2.0 ] beta: .NaN # converted to 0.0 in test code + stride_scale: [ 1 ] batch_count: [ -1, 0, 1, 3 ] - name: gemv_strided_batched_small @@ -67,6 +68,7 @@ Tests: matrix_size: *small_matrix_size_range incx_incy: *incx_incy_range alpha_beta: *alpha_beta_range + stride_scale: [ 1 ] batch_count: [ -1, 0, 1, 3 ] - name: gemv_strided_batched_medium @@ -77,6 +79,7 @@ Tests: matrix_size: *medium_matrix_size_range incx_incy: *incx_incy_range alpha_beta: *alpha_beta_range + stride_scale: [ 1 ] batch_count: [ 3 ] - name: gemv_strided_batched_large @@ -87,5 +90,6 @@ Tests: matrix_size: *large_matrix_size_range incx_incy: *incx_incy_range alpha_beta: *alpha_beta_range + stride_scale: [ 1 ] batch_count: [ 3 ] ... diff --git a/clients/gtest/ger_gtest.cpp b/clients/gtest/ger_gtest.cpp index 6154bc04d..1b9f3294a 100644 --- a/clients/gtest/ger_gtest.cpp +++ b/clients/gtest/ger_gtest.cpp @@ -55,24 +55,32 @@ namespace { RocBLAS_TestName name; - name << rocblas_datatype2string(arg.a_type) << '_' << arg.M << '_' << arg.N << '_' - << arg.alpha << '_' << arg.incx; + name << rocblas_datatype2string(arg.a_type); - if(GER_TYPE == GER_STRIDED_BATCHED) - name << '_' << arg.stride_x; + if(strstr(arg.function, "_bad_arg") != nullptr) + { + name << "_bad_arg"; + } + else + { + name << '_' << arg.M << '_' << arg.N << '_' << arg.alpha << '_' << arg.incx; + + if(GER_TYPE == GER_STRIDED_BATCHED) + name << '_' << arg.stride_x; - name << '_' << arg.incy; + name << '_' << arg.incy; - if(GER_TYPE == GER_STRIDED_BATCHED) - name << '_' << arg.stride_y; + if(GER_TYPE == GER_STRIDED_BATCHED) + name << '_' << arg.stride_y; - name << '_' << arg.lda; + name << '_' << arg.lda; - if(GER_TYPE == GER_STRIDED_BATCHED) - name << '_' << arg.stride_a; + if(GER_TYPE == GER_STRIDED_BATCHED) + name << '_' << arg.stride_a; - if(GER_TYPE == GER_STRIDED_BATCHED || GER_TYPE == GER_BATCHED) - name << '_' << arg.batch_count; + if(GER_TYPE == GER_STRIDED_BATCHED || GER_TYPE == GER_BATCHED) + name << '_' << arg.batch_count; + } return std::move(name); } diff --git a/clients/gtest/ger_gtest.yaml b/clients/gtest/ger_gtest.yaml index a1ab5d422..7215ad2be 100644 --- a/clients/gtest/ger_gtest.yaml +++ b/clients/gtest/ger_gtest.yaml @@ -11,7 +11,7 @@ Definitions: - { M: 0, N: 1, lda: 1, stride_a: 1 } - { M: 1, N: 0, lda: 1, stride_a: 1 } - { M: 1, N: 1, lda: 0, stride_a: 1 } - - { M: 11, N: 12, lda: 13, stride_a: 1 } + - { M: 11, N: 12, lda: 13, stride_a: 156 } - { M: 16, N: 16, lda: 16, stride_a: 256 } - { M: 33, N: 32, lda: 33, stride_a: 1056 } - { M: 65, N: 65, lda: 66, stride_a: 4300 } @@ -46,7 +46,10 @@ Definitions: Tests: - name: ger_bad_arg category: pre_checkin - function: ger_bad_arg + function: + - ger_bad_arg + - ger_batched_bad_arg + - ger_strided_batched_bad_arg precision: *single_double_precisions - name: ger_small @@ -73,12 +76,6 @@ Tests: incx_incy: *incx_incy_range alpha: [ -0.5, 2.0, 0.0, 0.6 ] -- name: ger_batched_bad_arg - category: pre_checkin - function: ger_batched_bad_arg - precision: *single_double_precisions - batch_count: [ -5, 0, 1, 5, 10 ] - - name: ger_batched_small category: quick function: ger_batched @@ -106,13 +103,6 @@ Tests: alpha: [ -0.5, 2.0, 0.0 ] batch_count: [ 1, 3 ] -- name: ger_strided_batched_bad_arg - category: pre_checkin - function: ger_strided_batched_bad_arg - precision: *single_double_precisions - stride_scale: [ -1, 0, 0.5, 1, 2 ] - batch_count: [ -5, 0, 1, 5, 10 ] - - name: ger_strided_batched_small category: quick function: ger_strided_batched @@ -120,7 +110,7 @@ Tests: matrix_size: *small_matrix_size_range incx_incy: *incx_incy_range alpha: [ -0.5, 2.0, 0.0 ] - stride_scale: [ 0.5, 1, 2 ] + stride_scale: [ 1, 2 ] batch_count: [ -5, 0, 1, 5, 10 ] - name: ger_strided_batched_medium @@ -130,7 +120,7 @@ Tests: matrix_size: *medium_matrix_size_range incx_incy: *incx_incy_range alpha: [ -0.5, 2.0, 0.0 ] - stride_scale: [ 0.5, 1, 2 ] + stride_scale: [ 1, 2 ] batch_count: [ 1, 5, 10 ] - name: ger_strided_batched_large @@ -140,5 +130,6 @@ Tests: matrix_size: *large_matrix_size_range incx_incy: *nightly_incx_incy_range alpha: [ -0.5, 2.0, 0.0 ] + stride_scale: [ 1 ] batch_count: [ 1, 3 ] ... diff --git a/clients/include/testing_gemv_strided_batched.hpp b/clients/include/testing_gemv_strided_batched.hpp index 381888908..4cbe4d156 100644 --- a/clients/include/testing_gemv_strided_batched.hpp +++ b/clients/include/testing_gemv_strided_batched.hpp @@ -197,8 +197,7 @@ void testing_gemv_strided_batched(const Arguments& arg) size_y = dim_y * abs_incy; // argument sanity check before allocating invalid memory - if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || stride_a < size_A - || stride_x < size_x || stride_y < size_y || batch_count < 0) + if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || batch_count < 0) { static const size_t safe_size = 100; // arbitrarily set to 100 device_vector dA1(safe_size); diff --git a/clients/include/testing_ger_strided_batched.hpp b/clients/include/testing_ger_strided_batched.hpp index 235dda315..2a6d0c6ea 100644 --- a/clients/include/testing_ger_strided_batched.hpp +++ b/clients/include/testing_ger_strided_batched.hpp @@ -131,8 +131,7 @@ void testing_ger_strided_batched(const Arguments& arg) size_t size_y = N * abs_incy; // argument check before allocating invalid memory - if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || stride_a < size_A - || stride_x < size_x || stride_y < size_y || batch_count < 0) + if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || batch_count < 0) { static const size_t safe_size = 100; // arbitrarily set to 100 device_vector dA_1(safe_size); diff --git a/library/src/blas2/gemv_device.hpp b/library/src/blas2/gemv_device.hpp index d2b973e6f..4b77ba507 100644 --- a/library/src/blas2/gemv_device.hpp +++ b/library/src/blas2/gemv_device.hpp @@ -383,199 +383,96 @@ __device__ void gemvt_kernel(rocblas_int m, } } -template -__global__ void gemvn_kernel_strided(rocblas_int m, +template +__global__ void gemvn_kernel_test(rocblas_int m, rocblas_int n, U alpha_device_host, - const T* Aa, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, rocblas_int lda, - rocblas_int strideA, - const T* xa, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, U beta_device_host, - T* ya, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, rocblas_int incy, - rocblas_int stridey) + rocblas_stride stridey) { rocblas_int num_threads = hipBlockDim_x * hipBlockDim_y * hipBlockDim_z; if(DIM_X * DIM_Y != num_threads) return; // need to launch exactly the same number of threads as template parameters indicate - const T* A; - const T* x; - T* y; - A = Aa + hipBlockIdx_y * strideA; - x = xa + hipBlockIdx_y * stridex; - y = ya + hipBlockIdx_y * stridey; + const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); + const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); + T* y = load_ptr_batch(ya, hipBlockIdx_y, shifty, stridey); - if(incx < 0) - x -= ptrdiff_t(incx) * (n - 1); - if(incy < 0) - y -= ptrdiff_t(incy) * (m - 1); - - auto alpha = load_scalar(alpha_device_host); - auto beta = load_scalar(beta_device_host); + auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); + auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); gemvn_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); } -template -__global__ void gemvc_kernel_strided(rocblas_int m, +template +__global__ void gemvc_kernel_test(rocblas_int m, rocblas_int n, U alpha_device_host, - const T* Aa, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, rocblas_int lda, - rocblas_int strideA, - const T* xa, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, U beta_device_host, - T* ya, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, rocblas_int incy, - rocblas_int stridey) + rocblas_stride stridey) { - const T* A; - const T* x; - T* y; - A = Aa + hipBlockIdx_y * strideA; - x = xa + hipBlockIdx_y * stridex; - y = ya + hipBlockIdx_y * stridey; + const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); + const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); + T* y = load_ptr_batch(ya, hipBlockIdx_y, shifty, stridey); - if(incx < 0) - x -= ptrdiff_t(incx) * (m - 1); - if(incy < 0) - y -= ptrdiff_t(incy) * (n - 1); - - auto alpha = load_scalar(alpha_device_host); - auto beta = load_scalar(beta_device_host); + auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); + auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); gemvc_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); } -template -__global__ void gemvt_kernel_strided(rocblas_int m, +template +__global__ void gemvt_kernel_test(rocblas_int m, rocblas_int n, U alpha_device_host, - const T* Aa, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, rocblas_int lda, - rocblas_int strideA, - const T* xa, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, U beta_device_host, - T* ya, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, rocblas_int incy, - rocblas_int stridey) -{ - const T* A; - const T* x; - T* y; - A = Aa + hipBlockIdx_y * strideA; - x = xa + hipBlockIdx_y * stridex; - y = ya + hipBlockIdx_y * stridey; - - if(incx < 0) - x -= ssize_t(incx) * (m - 1); - if(incy < 0) - y -= ssize_t(incy) * (n - 1); - - auto alpha = load_scalar(alpha_device_host); - auto beta = load_scalar(beta_device_host); - - gemvt_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); -} - -template -__global__ void gemvn_kernel_batched(rocblas_int m, - rocblas_int n, - U alpha_device_host, - const T* const Aa[], - rocblas_int lda, - const T* const xa[], - rocblas_int incx, - U beta_device_host, - T* const ya[], - rocblas_int incy) -{ - rocblas_int num_threads = hipBlockDim_x * hipBlockDim_y * hipBlockDim_z; - if(DIM_X * DIM_Y != num_threads) - return; // need to launch exactly the same number of threads as template parameters indicate - - const T* A; - const T* x; - T* y; - A = Aa[hipBlockIdx_y]; - x = xa[hipBlockIdx_y]; - y = ya[hipBlockIdx_y]; - - if(incx < 0) - x -= ptrdiff_t(incx) * (n - 1); - if(incy < 0) - y -= ptrdiff_t(incy) * (m - 1); - - auto alpha = load_scalar(alpha_device_host); - auto beta = load_scalar(beta_device_host); - - gemvn_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); -} - -template -__global__ void gemvc_kernel_batched(rocblas_int m, - rocblas_int n, - U alpha_device_host, - const T* const Aa[], - rocblas_int lda, - const T* const xa[], - rocblas_int incx, - U beta_device_host, - T* const ya[], - rocblas_int incy) + rocblas_stride stridey) { - const T* A; - const T* x; - T* y; - A = Aa[hipBlockIdx_y]; - x = xa[hipBlockIdx_y]; - y = ya[hipBlockIdx_y]; + const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); + const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); + T* y = load_ptr_batch(ya, hipBlockIdx_y, shifty, stridey); - if(incx < 0) - x -= ptrdiff_t(incx) * (m - 1); - if(incy < 0) - y -= ptrdiff_t(incy) * (n - 1); - - auto alpha = load_scalar(alpha_device_host); - auto beta = load_scalar(beta_device_host); - - gemvc_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); -} - -template -__global__ void gemvt_kernel_batched(rocblas_int m, - rocblas_int n, - U alpha_device_host, - const T* const Aa[], - rocblas_int lda, - const T* const xa[], - rocblas_int incx, - U beta_device_host, - T* const ya[], - rocblas_int incy) -{ - const T* A; - const T* x; - T* y; - A = Aa[hipBlockIdx_y]; - x = xa[hipBlockIdx_y]; - y = ya[hipBlockIdx_y]; - - if(incx < 0) - x -= ssize_t(incx) * (m - 1); - if(incy < 0) - y -= ssize_t(incy) * (n - 1); - - auto alpha = load_scalar(alpha_device_host); - auto beta = load_scalar(beta_device_host); + auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); + auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); gemvt_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); } diff --git a/library/src/blas2/rocblas_gemv.cpp b/library/src/blas2/rocblas_gemv.cpp index 25aeaf9e4..82ce30b78 100644 --- a/library/src/blas2/rocblas_gemv.cpp +++ b/library/src/blas2/rocblas_gemv.cpp @@ -132,7 +132,7 @@ namespace if(m < 0 || n < 0 || lda < m || lda < 1 || !incx || !incy) return rocblas_status_invalid_size; - return rocblas_gemv_template(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy); + return rocblas_gemv_template(handle, transA, m, n, alpha, A, 0, lda, 0, x, 0, incx, 0, beta, y, 0, incy, 0, 1); } } // namespace diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp index a3b6561eb..d0ff6414e 100644 --- a/library/src/blas2/rocblas_gemv.hpp +++ b/library/src/blas2/rocblas_gemv.hpp @@ -7,19 +7,26 @@ #include "handle.h" #include "rocblas.h" -template +template rocblas_status rocblas_gemv_template(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n, - const T* alpha, - const T* A, + const U* alpha, + const V* A, + rocblas_int offseta, rocblas_int lda, - const T* x, + rocblas_stride strideA, + const V* x, + rocblas_int offsetx, rocblas_int incx, - const T* beta, - T* y, - rocblas_int incy) + rocblas_stride stridex, + const U* beta, + W* y, + rocblas_int offsety, + rocblas_int incy, + rocblas_stride stridey, + rocblas_int batch_count) { //quick return if(!m || !n) @@ -27,194 +34,9 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, hipStream_t rocblas_stream = handle->rocblas_stream; - if(transA == rocblas_operation_none) - { - // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s - static constexpr int GEMVN_DIM_X = 64; - static constexpr int GEMVN_DIM_Y = 16; - rocblas_int blocks = (m - 1) / (GEMVN_DIM_X * 4) + 1; - if(std::is_same{}) - blocks = (m - 1) / (GEMVN_DIM_X) + 1; - dim3 gemvn_grid(blocks, 1); - dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - { - hipLaunchKernelGGL((gemvn_kernel_strided), - gemvn_grid, - gemvn_threads, - 0, - rocblas_stream, - m, - n, - alpha, - A, - lda, - 0, // strideA = 0 - x, - incx, - 0, // stridex = 0 - beta, - y, - incy, - 0); // stridey = 0 - } - else - { - if(!*alpha && *beta == 1) - return rocblas_status_success; - - hipLaunchKernelGGL((gemvn_kernel_strided), - gemvn_grid, - gemvn_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - A, - lda, - 0, // strideA = 0 - x, - incx, - 0, // stridex = 0 - *beta, - y, - incy, - 0); // stridey = 0 - } - } - else if(transA == rocblas_operation_transpose) - { - // transpose - // number of columns on the y-dim of the grid - static constexpr int NB = 256; - dim3 gemvt_grid(n, 1); - dim3 gemvt_threads(NB); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - { - hipLaunchKernelGGL(gemvt_kernel_strided, - gemvt_grid, - gemvt_threads, - 0, - rocblas_stream, - m, - n, - alpha, - A, - lda, - 0, // strideA = 0 - x, - incx, - 0, // stridex = 0 - beta, - y, - incy, - 0); // stridey = 0 - } - else - { - if(!*alpha && *beta == 1) - return rocblas_status_success; - - hipLaunchKernelGGL(gemvt_kernel_strided, - gemvt_grid, - gemvt_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - A, - lda, - 0, // strideA = 0 - x, - incx, - 0, // stridex = 0 - *beta, - y, - incy, - 0); // stridey = 0 - } - } - else // conjugate transpose - { - // conjugate transpose - // number of columns on the y-dim of the grid - static constexpr int NB = 256; - dim3 gemvc_grid(n, 1); - dim3 gemvc_threads(NB); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - { - hipLaunchKernelGGL(gemvc_kernel_strided, - gemvc_grid, - gemvc_threads, - 0, - rocblas_stream, - m, - n, - alpha, - A, - lda, - 0, // strideA = 0 - x, - incx, - 0, // stridex = 0 - beta, - y, - incy, - 0); // stridey = 0 - } - else - { - if(!*alpha && *beta == 1) - return rocblas_status_success; - - hipLaunchKernelGGL(gemvc_kernel_strided, - gemvc_grid, - gemvc_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - A, - lda, - 0, // strideA = 0 - x, - incx, - 0, // stridex = 0 - *beta, - y, - incy, - 0); // stridey = 0 - } - } - return rocblas_status_success; -} - -template -rocblas_status rocblas_gemv_batched_template(rocblas_handle handle, - rocblas_operation transA, - rocblas_int m, - rocblas_int n, - const T* alpha, - const T* const A[], - rocblas_int lda, - const T* const x[], - rocblas_int incx, - const T* beta, - T* const y[], - rocblas_int incy, - rocblas_int batch_count) -{ - // Quick return if possible. Not Argument error - if(!m || !n || !batch_count) - return rocblas_status_success; - - hipStream_t rocblas_stream = handle->rocblas_stream; + // in case of negative inc shift pointer to end of data for negative indexing tid*inc + auto shiftx = incx < 0 ? offsetx - ptrdiff_t(incx) * ((transA == rocblas_operation_none ? n : m) - 1) : offsetx; + auto shifty = incy < 0 ? offsety - ptrdiff_t(incy) * ((transA == rocblas_operation_none ? m : n) - 1) : offsety; if(transA == rocblas_operation_none) { @@ -224,13 +46,12 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle handle, rocblas_int blocks = (m - 1) / (GEMVN_DIM_X * 4) + 1; if(std::is_same{}) blocks = (m - 1) / (GEMVN_DIM_X) + 1; - dim3 gemvn_grid(blocks, batch_count); dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y); if(handle->pointer_mode == rocblas_pointer_mode_device) { - hipLaunchKernelGGL((gemvn_kernel_batched), + hipLaunchKernelGGL((gemvn_kernel_test), gemvn_grid, gemvn_threads, 0, @@ -238,190 +59,19 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle handle, m, n, alpha, - A, - lda, - x, - incx, - beta, - y, - incy); - } - else - { - if(!*alpha && *beta == 1) - return rocblas_status_success; - - hipLaunchKernelGGL((gemvn_kernel_batched), - gemvn_grid, - gemvn_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - A, - lda, - x, - incx, - *beta, - y, - incy); - } - } - else if(transA == rocblas_operation_transpose) - { - // transpose - // number of columns on the y-dim of the grid - static constexpr int NB = 256; - dim3 gemvt_grid(n, batch_count); - dim3 gemvt_threads(NB); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - { - hipLaunchKernelGGL(gemvt_kernel_batched, - gemvt_grid, - gemvt_threads, - 0, - rocblas_stream, - m, - n, - alpha, - A, - lda, - x, - incx, - beta, - y, - incy); - } - else - { - if(!*alpha && *beta == 1) - return rocblas_status_success; - - hipLaunchKernelGGL(gemvt_kernel_batched, - gemvt_grid, - gemvt_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - A, - lda, - x, - incx, - *beta, - y, - incy); - } - } - else // conjugate transpose - { - // conjugate transpose - // number of columns on the y-dim of the grid - static constexpr int NB = 256; - dim3 gemvc_grid(n, batch_count); - dim3 gemvc_threads(NB); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - { - hipLaunchKernelGGL(gemvc_kernel_batched, - gemvc_grid, - gemvc_threads, - 0, - rocblas_stream, - m, - n, - alpha, - A, - lda, - x, - incx, - beta, - y, - incy); - } - else - { - if(!*alpha && *beta == 1) - return rocblas_status_success; - - hipLaunchKernelGGL(gemvc_kernel_batched, - gemvc_grid, - gemvc_threads, 0, - rocblas_stream, - m, - n, - *alpha, - A, - lda, - x, - incx, - *beta, - y, - incy); - } - } - - return rocblas_status_success; -} - -template -rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, - rocblas_operation transA, - rocblas_int m, - rocblas_int n, - const T* alpha, - const T* A, - rocblas_int lda, - rocblas_int strideA, - const T* x, - rocblas_int incx, - rocblas_int stridex, - const T* beta, - T* y, - rocblas_int incy, - rocblas_int stridey, - rocblas_int batch_count) -{ - // Quick return if possible. Not Argument error - if(!m || !n || !batch_count) - return rocblas_status_success; - - hipStream_t rocblas_stream = handle->rocblas_stream; - - if(transA == rocblas_operation_none) - { - // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s - static constexpr int GEMVN_DIM_X = 64; - static constexpr int GEMVN_DIM_Y = 16; - rocblas_int blocks = (m - 1) / (GEMVN_DIM_X * 4) + 1; - if(std::is_same{}) - blocks = (m - 1) / (GEMVN_DIM_X) + 1; - - dim3 gemvn_grid(blocks, batch_count); - dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - { - hipLaunchKernelGGL((gemvn_kernel_strided), - gemvn_grid, - gemvn_threads, - 0, - rocblas_stream, - m, - n, - alpha, A, + offseta, lda, strideA, x, + shiftx, incx, stridex, beta, + 0, y, + shifty, incy, stridey); } @@ -430,7 +80,7 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, if(!*alpha && *beta == 1) return rocblas_status_success; - hipLaunchKernelGGL((gemvn_kernel_strided), + hipLaunchKernelGGL((gemvn_kernel_test), gemvn_grid, gemvn_threads, 0, @@ -438,14 +88,19 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, m, n, *alpha, + 0, A, + offseta, lda, strideA, x, + shiftx, incx, stridex, *beta, + 0, y, + shifty, incy, stridey); } @@ -460,7 +115,7 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, if(handle->pointer_mode == rocblas_pointer_mode_device) { - hipLaunchKernelGGL(gemvt_kernel_strided, + hipLaunchKernelGGL((gemvt_kernel_test), gemvt_grid, gemvt_threads, 0, @@ -468,14 +123,19 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, m, n, alpha, + 0, A, + offseta, lda, strideA, x, + shiftx, incx, stridex, beta, + 0, y, + shifty, incy, stridey); } @@ -484,7 +144,7 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, if(!*alpha && *beta == 1) return rocblas_status_success; - hipLaunchKernelGGL(gemvt_kernel_strided, + hipLaunchKernelGGL((gemvt_kernel_test), gemvt_grid, gemvt_threads, 0, @@ -492,14 +152,19 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, m, n, *alpha, + 0, A, + offseta, lda, strideA, x, + shiftx, incx, stridex, *beta, + 0, y, + shifty, incy, stridey); } @@ -514,7 +179,7 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, if(handle->pointer_mode == rocblas_pointer_mode_device) { - hipLaunchKernelGGL(gemvc_kernel_strided, + hipLaunchKernelGGL((gemvc_kernel_test), gemvc_grid, gemvc_threads, 0, @@ -522,14 +187,19 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, m, n, alpha, + 0, A, + offseta, lda, strideA, x, + shiftx, incx, stridex, beta, + 0, y, + shifty, incy, stridey); } @@ -538,7 +208,7 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, if(!*alpha && *beta == 1) return rocblas_status_success; - hipLaunchKernelGGL(gemvc_kernel_strided, + hipLaunchKernelGGL((gemvc_kernel_test), gemvc_grid, gemvc_threads, 0, @@ -546,14 +216,19 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle handle, m, n, *alpha, + 0, A, + offseta, lda, strideA, x, + shiftx, incx, stridex, *beta, + 0, y, + shifty, incy, stridey); } diff --git a/library/src/blas2/rocblas_gemv_batched.cpp b/library/src/blas2/rocblas_gemv_batched.cpp index c0af67b88..64e5ac7c1 100644 --- a/library/src/blas2/rocblas_gemv_batched.cpp +++ b/library/src/blas2/rocblas_gemv_batched.cpp @@ -138,8 +138,8 @@ namespace if(batch_count < 0) return rocblas_status_invalid_size; - return rocblas_gemv_batched_template( - handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count); + return rocblas_gemv_template( + handle, transA, m, n, alpha, A, 0, lda, 0, x, 0, incx, 0, beta, y, 0, incy, 0, batch_count); } } // namespace diff --git a/library/src/blas2/rocblas_gemv_strided_batched.cpp b/library/src/blas2/rocblas_gemv_strided_batched.cpp index 8e2b6fc30..93a76b6b1 100644 --- a/library/src/blas2/rocblas_gemv_strided_batched.cpp +++ b/library/src/blas2/rocblas_gemv_strided_batched.cpp @@ -156,47 +156,25 @@ namespace return rocblas_status_invalid_pointer; if(m < 0 || n < 0 || lda < m || lda < 1 || !incx || !incy) return rocblas_status_invalid_size; - if(strideA < lda * n) - return rocblas_status_invalid_size; if(batch_count < 0) return rocblas_status_invalid_size; - size_t size_x, dim_x, abs_incx; - size_t size_y, dim_y, abs_incy; - - if(transA == rocblas_operation_none) - { - dim_x = n; - dim_y = m; - } - else - { - dim_x = m; - dim_y = n; - } - - abs_incx = incx >= 0 ? incx : -incx; - abs_incy = incy >= 0 ? incy : -incy; - - size_x = dim_x * abs_incx; - size_y = dim_y * abs_incy; - - if(stridex < size_x || stridey < size_y) - return rocblas_status_invalid_size; - - return rocblas_gemv_strided_batched_template(handle, + return rocblas_gemv_template(handle, transA, m, n, alpha, A, + 0, lda, strideA, x, + 0, incx, stridex, beta, y, + 0, incy, stridey, batch_count); diff --git a/library/src/blas2/rocblas_ger.cpp b/library/src/blas2/rocblas_ger.cpp index 218d21419..815fc208c 100644 --- a/library/src/blas2/rocblas_ger.cpp +++ b/library/src/blas2/rocblas_ger.cpp @@ -4,7 +4,7 @@ #include "handle.h" #include "logging.h" #include "rocblas.h" -#include "rocblas_ger_strided_batched.hpp" +#include "rocblas_ger.hpp" #include "utility.h" namespace @@ -88,8 +88,8 @@ namespace if(!m || !n) return rocblas_status_success; - rocblas_ger_strided_batched_template( - handle, m, n, alpha, x, 0, incx, incx * m, y, 0, incy, incy * n, A, 0, lda, lda * n, 1); + rocblas_ger_template( + handle, m, n, alpha, 0, x, 0, incx, incx * m, y, 0, incy, incy * n, A, 0, lda, lda * n, 1); return rocblas_status_success; } diff --git a/library/src/blas2/rocblas_ger_batched.cpp b/library/src/blas2/rocblas_ger_batched.cpp index 8715638c6..fd158169b 100644 --- a/library/src/blas2/rocblas_ger_batched.cpp +++ b/library/src/blas2/rocblas_ger_batched.cpp @@ -1,7 +1,7 @@ /* ************************************************************************ * Copyright 2016-2019 Advanced Micro Devices, Inc. * ************************************************************************ */ -#include "rocblas_ger_batched.hpp" +#include "rocblas_ger.hpp" #include "handle.h" #include "logging.h" #include "rocblas.h" @@ -115,8 +115,8 @@ namespace if(!m || !n || !batch_count) return rocblas_status_success; - rocblas_ger_batched_template( - handle, m, n, alpha, x, 0, incx, y, 0, incy, A, 0, lda, batch_count); + rocblas_ger_template( + handle, m, n, alpha, 0, x, 0, incx, 0, y, 0, incy, 0, A, 0, lda, 0, batch_count); return rocblas_status_success; } diff --git a/library/src/blas2/rocblas_ger_batched.hpp b/library/src/blas2/rocblas_ger_batched.hpp deleted file mode 100644 index 172238d7f..000000000 --- a/library/src/blas2/rocblas_ger_batched.hpp +++ /dev/null @@ -1,108 +0,0 @@ -/* ************************************************************************ - * Copyright 2016-2019 Advanced Micro Devices, Inc. - * ************************************************************************ */ -#include "handle.h" -#include "logging.h" -#include "rocblas.h" -#include "utility.h" - -template -__global__ void ger_batched_kernel(rocblas_int m, - rocblas_int n, - U alpha_device_host, - const T* const __restrict__ xa[], - rocblas_int shiftx, - rocblas_int incx, - const T* const __restrict__ ya[], - rocblas_int shifty, - rocblas_int incy, - T* const Aa[], - rocblas_int shiftA, - rocblas_int lda) -{ - ptrdiff_t tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - ptrdiff_t ty = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - - if(tx < m && ty < n) - { - auto alpha = load_scalar(alpha_device_host); - T* A; - const T* __restrict__ x; - const T* __restrict__ y; - A = Aa[hipBlockIdx_z] + shiftA; - x = xa[hipBlockIdx_z] + shiftx; - y = ya[hipBlockIdx_z] + shifty; - - if(incx < 0) - x -= ssize_t(incx) * (m - 1); - if(incy < 0) - y -= ssize_t(incy) * (n - 1); - - A[tx + lda * ty] += alpha * x[tx * incx] * y[ty * incy]; - } -} - -template -rocblas_status rocblas_ger_batched_template(rocblas_handle handle, - rocblas_int m, - rocblas_int n, - const T* alpha, - const T* const x[], - rocblas_int shiftx, - rocblas_int incx, - const T* const y[], - rocblas_int shifty, - rocblas_int incy, - T* const A[], - rocblas_int shiftA, - rocblas_int lda, - rocblas_int batch_count) -{ - hipStream_t rocblas_stream = handle->rocblas_stream; - - static constexpr int GEMV_DIM_X = 128; - static constexpr int GEMV_DIM_Y = 8; - rocblas_int blocksX = (m - 1) / GEMV_DIM_X + 1; - rocblas_int blocksY = (n - 1) / GEMV_DIM_Y + 1; - - dim3 ger_batched_grid(blocksX, blocksY, batch_count); - dim3 ger_batched_threads(GEMV_DIM_X, GEMV_DIM_Y); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - hipLaunchKernelGGL(ger_batched_kernel, - ger_batched_grid, - ger_batched_threads, - 0, - rocblas_stream, - m, - n, - alpha, - x, - shiftx, - incx, - y, - shifty, - incy, - A, - shiftA, - lda); - else - hipLaunchKernelGGL(ger_batched_kernel, - ger_batched_grid, - ger_batched_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - x, - shiftx, - incx, - y, - shifty, - incy, - A, - shiftA, - lda); - return rocblas_status_success; -} diff --git a/library/src/blas2/rocblas_ger_strided_batched.cpp b/library/src/blas2/rocblas_ger_strided_batched.cpp index 9343b24f2..d157e6a89 100644 --- a/library/src/blas2/rocblas_ger_strided_batched.cpp +++ b/library/src/blas2/rocblas_ger_strided_batched.cpp @@ -1,7 +1,7 @@ /* ************************************************************************ * Copyright 2016-2019 Advanced Micro Devices, Inc. * ************************************************************************ */ -#include "rocblas_ger_strided_batched.hpp" +#include "rocblas_ger.hpp" #include "handle.h" #include "logging.h" #include "rocblas.h" @@ -129,18 +129,18 @@ namespace if(!x || !y || !A) return rocblas_status_invalid_pointer; - if(m < 0 || n < 0 || !incx || !incy || lda < m || lda < 1 || stridex < m * std::abs(incx) - || stridey < n * abs(incy) || strideA < lda * n || batch_count < 0) + if(m < 0 || n < 0 || !incx || !incy || lda < m || lda < 1 || batch_count < 0) return rocblas_status_invalid_size; // Quick return if possible. Not Argument error if(!m || !n || !batch_count) return rocblas_status_success; - rocblas_ger_strided_batched_template(handle, + rocblas_ger_template(handle, m, n, alpha, + 0, x, 0, incx, diff --git a/library/src/blas2/rocblas_ger_strided_batched.hpp b/library/src/blas2/rocblas_ger_strided_batched.hpp deleted file mode 100644 index 17defad70..000000000 --- a/library/src/blas2/rocblas_ger_strided_batched.hpp +++ /dev/null @@ -1,121 +0,0 @@ -/* ************************************************************************ - * Copyright 2016-2019 Advanced Micro Devices, Inc. - * ************************************************************************ */ -#include "handle.h" -#include "logging.h" -#include "rocblas.h" -#include "utility.h" - -template -__global__ void ger_strided_batched_kernel(rocblas_int m, - rocblas_int n, - U alpha_device_host, - const T* const __restrict__ xa, - rocblas_int shiftx, - rocblas_int incx, - rocblas_int stridex, - const T* const __restrict__ ya, - rocblas_int shifty, - rocblas_int incy, - rocblas_int stridey, - T* const Aa, - rocblas_int shiftA, - rocblas_int lda, - rocblas_int strideA) -{ - - ptrdiff_t tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - ptrdiff_t ty = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - - if(tx < m && ty < n) - { - auto alpha = load_scalar(alpha_device_host); - T* A; - const T* __restrict__ x; - const T* __restrict__ y; - A = Aa + hipBlockIdx_z * strideA + shiftA; - x = xa + hipBlockIdx_z * stridex + shiftx; - y = ya + hipBlockIdx_z * stridey + shifty; - - A[tx + lda * ty] += alpha * x[tx * incx] * y[ty * incy]; - } -} - -template -rocblas_status rocblas_ger_strided_batched_template(rocblas_handle handle, - rocblas_int m, - rocblas_int n, - const T* alpha, - const T* x, - rocblas_int shiftx, - rocblas_int incx, - rocblas_int stridex, - const T* y, - rocblas_int shifty, - rocblas_int incy, - rocblas_int stridey, - T* A, - rocblas_int shiftA, - rocblas_int lda, - rocblas_int strideA, - rocblas_int batch_count) -{ - hipStream_t rocblas_stream = handle->rocblas_stream; - - static constexpr int GEMV_DIM_X = 128; - static constexpr int GEMV_DIM_Y = 8; - rocblas_int blocksX = (m - 1) / GEMV_DIM_X + 1; - rocblas_int blocksY = (n - 1) / GEMV_DIM_Y + 1; - - dim3 ger_strided_batched_grid(blocksX, blocksY, batch_count); - dim3 ger_strided_batched_threads(GEMV_DIM_X, GEMV_DIM_Y); - - if(incx < 0) - x -= ptrdiff_t(incx) * (m - 1); - if(incy < 0) - y -= ptrdiff_t(incy) * (n - 1); - - if(handle->pointer_mode == rocblas_pointer_mode_device) - hipLaunchKernelGGL(ger_strided_batched_kernel, - ger_strided_batched_grid, - ger_strided_batched_threads, - 0, - rocblas_stream, - m, - n, - alpha, - x, - shiftx, - incx, - stridex, - y, - shifty, - incy, - stridey, - A, - shiftA, - lda, - strideA); - else - hipLaunchKernelGGL(ger_strided_batched_kernel, - ger_strided_batched_grid, - ger_strided_batched_threads, - 0, - rocblas_stream, - m, - n, - *alpha, - x, - shiftx, - incx, - stridex, - y, - shifty, - incy, - stridey, - A, - shiftA, - lda, - strideA); - return rocblas_status_success; -} diff --git a/library/src/include/utility.h b/library/src/include/utility.h index 0289da55e..c5a3d8a61 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -103,6 +103,22 @@ __forceinline__ __device__ __host__ T* return p[block] + offset; } +// For device pointers +template +__forceinline__ __device__ __host__ T* + load_ptr_batch(T* p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) +{ + return p + block * stride + offset; +} + +// For device array of device pointers +template +__forceinline__ __device__ __host__ T* + load_ptr_batch(T* const* p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) +{ + return p[block] + offset; +} + #endif // GOOGLE_TEST inline bool isAligned(const void* pointer, size_t byte_count) From a27210640c309271d1feaedb41370d005756e65f Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Thu, 3 Oct 2019 11:42:09 -0600 Subject: [PATCH 02/13] format and fix scal batched bandwidth --- clients/include/testing_scal_batched.hpp | 2 +- .../include/testing_scal_strided_batched.hpp | 2 +- library/src/blas2/gemv_device.hpp | 108 +++++++++--------- library/src/blas2/rocblas_gemv.cpp | 3 +- library/src/blas2/rocblas_gemv.hpp | 14 ++- library/src/blas2/rocblas_gemv_batched.cpp | 21 +++- .../blas2/rocblas_gemv_strided_batched.cpp | 36 +++--- library/src/blas2/rocblas_ger.cpp | 22 +++- library/src/blas2/rocblas_ger_batched.cpp | 2 +- .../src/blas2/rocblas_ger_strided_batched.cpp | 36 +++--- 10 files changed, 142 insertions(+), 104 deletions(-) diff --git a/clients/include/testing_scal_batched.hpp b/clients/include/testing_scal_batched.hpp index 8704bde04..5a9ab5ad6 100644 --- a/clients/include/testing_scal_batched.hpp +++ b/clients/include/testing_scal_batched.hpp @@ -176,7 +176,7 @@ void testing_scal_batched(const Arguments& arg) gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; rocblas_gflops = batch_count * scal_gflop_count(N) / gpu_time_used * 1e6 * 1; - rocblas_bandwidth = (2.0 * N) * sizeof(T) / gpu_time_used / 1e3; + rocblas_bandwidth = batch_count * (2.0 * N) * sizeof(T) / gpu_time_used / 1e3; std::cout << "N,alpha,incx,rocblas-Gflops,rocblas-GB/s,rocblas-us"; diff --git a/clients/include/testing_scal_strided_batched.hpp b/clients/include/testing_scal_strided_batched.hpp index 2cc1d2fe6..5c070c204 100644 --- a/clients/include/testing_scal_strided_batched.hpp +++ b/clients/include/testing_scal_strided_batched.hpp @@ -162,7 +162,7 @@ void testing_scal_strided_batched(const Arguments& arg) gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; rocblas_gflops = batch_count * scal_gflop_count(N) / gpu_time_used * 1e6 * 1; - rocblas_bandwidth = (2.0 * N) * sizeof(T) / gpu_time_used / 1e3; + rocblas_bandwidth = batch_count * (2.0 * N) * sizeof(T) / gpu_time_used / 1e3; std::cout << "N,alpha,incx,rocblas-Gflops,rocblas-GB/s,rocblas-us"; diff --git a/library/src/blas2/gemv_device.hpp b/library/src/blas2/gemv_device.hpp index 4b77ba507..14a14bfcd 100644 --- a/library/src/blas2/gemv_device.hpp +++ b/library/src/blas2/gemv_device.hpp @@ -384,24 +384,24 @@ __device__ void gemvt_kernel(rocblas_int m, } template -__global__ void gemvn_kernel_test(rocblas_int m, - rocblas_int n, - U alpha_device_host, - rocblas_stride stride_alpha, - const V* Aa, - ptrdiff_t shifta, - rocblas_int lda, - rocblas_stride strideA, - const V* xa, - ptrdiff_t shiftx, - rocblas_int incx, - rocblas_stride stridex, - U beta_device_host, - rocblas_stride stride_beta, - W* ya, - ptrdiff_t shifty, - rocblas_int incy, - rocblas_stride stridey) +__global__ void gemvn_kernel_test(rocblas_int m, + rocblas_int n, + U alpha_device_host, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_stride stridex, + U beta_device_host, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_stride stridey) { rocblas_int num_threads = hipBlockDim_x * hipBlockDim_y * hipBlockDim_z; if(DIM_X * DIM_Y != num_threads) @@ -418,24 +418,24 @@ __global__ void gemvn_kernel_test(rocblas_int m, } template -__global__ void gemvc_kernel_test(rocblas_int m, - rocblas_int n, - U alpha_device_host, - rocblas_stride stride_alpha, - const V* Aa, - ptrdiff_t shifta, - rocblas_int lda, - rocblas_stride strideA, - const V* xa, - ptrdiff_t shiftx, - rocblas_int incx, - rocblas_stride stridex, - U beta_device_host, - rocblas_stride stride_beta, - W* ya, - ptrdiff_t shifty, - rocblas_int incy, - rocblas_stride stridey) +__global__ void gemvc_kernel_test(rocblas_int m, + rocblas_int n, + U alpha_device_host, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_stride stridex, + U beta_device_host, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_stride stridey) { const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); @@ -448,24 +448,24 @@ __global__ void gemvc_kernel_test(rocblas_int m, } template -__global__ void gemvt_kernel_test(rocblas_int m, - rocblas_int n, - U alpha_device_host, - rocblas_stride stride_alpha, - const V* Aa, - ptrdiff_t shifta, - rocblas_int lda, - rocblas_stride strideA, - const V* xa, - ptrdiff_t shiftx, - rocblas_int incx, - rocblas_stride stridex, - U beta_device_host, - rocblas_stride stride_beta, - W* ya, - ptrdiff_t shifty, - rocblas_int incy, - rocblas_stride stridey) +__global__ void gemvt_kernel_test(rocblas_int m, + rocblas_int n, + U alpha_device_host, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_stride stridex, + U beta_device_host, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_stride stridey) { const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); diff --git a/library/src/blas2/rocblas_gemv.cpp b/library/src/blas2/rocblas_gemv.cpp index 82ce30b78..f214074ef 100644 --- a/library/src/blas2/rocblas_gemv.cpp +++ b/library/src/blas2/rocblas_gemv.cpp @@ -132,7 +132,8 @@ namespace if(m < 0 || n < 0 || lda < m || lda < 1 || !incx || !incy) return rocblas_status_invalid_size; - return rocblas_gemv_template(handle, transA, m, n, alpha, A, 0, lda, 0, x, 0, incx, 0, beta, y, 0, incy, 0, 1); + return rocblas_gemv_template( + handle, transA, m, n, alpha, A, 0, lda, 0, x, 0, incx, 0, beta, y, 0, incy, 0, 1); } } // namespace diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp index d0ff6414e..0f91bff4b 100644 --- a/library/src/blas2/rocblas_gemv.hpp +++ b/library/src/blas2/rocblas_gemv.hpp @@ -16,16 +16,16 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, const V* A, rocblas_int offseta, rocblas_int lda, - rocblas_stride strideA, + rocblas_stride strideA, const V* x, rocblas_int offsetx, rocblas_int incx, - rocblas_stride stridex, + rocblas_stride stridex, const U* beta, W* y, rocblas_int offsety, rocblas_int incy, - rocblas_stride stridey, + rocblas_stride stridey, rocblas_int batch_count) { //quick return @@ -35,8 +35,12 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, hipStream_t rocblas_stream = handle->rocblas_stream; // in case of negative inc shift pointer to end of data for negative indexing tid*inc - auto shiftx = incx < 0 ? offsetx - ptrdiff_t(incx) * ((transA == rocblas_operation_none ? n : m) - 1) : offsetx; - auto shifty = incy < 0 ? offsety - ptrdiff_t(incy) * ((transA == rocblas_operation_none ? m : n) - 1) : offsety; + auto shiftx = incx < 0 + ? offsetx - ptrdiff_t(incx) * ((transA == rocblas_operation_none ? n : m) - 1) + : offsetx; + auto shifty = incy < 0 + ? offsety - ptrdiff_t(incy) * ((transA == rocblas_operation_none ? m : n) - 1) + : offsety; if(transA == rocblas_operation_none) { diff --git a/library/src/blas2/rocblas_gemv_batched.cpp b/library/src/blas2/rocblas_gemv_batched.cpp index 64e5ac7c1..0ba168d74 100644 --- a/library/src/blas2/rocblas_gemv_batched.cpp +++ b/library/src/blas2/rocblas_gemv_batched.cpp @@ -138,8 +138,25 @@ namespace if(batch_count < 0) return rocblas_status_invalid_size; - return rocblas_gemv_template( - handle, transA, m, n, alpha, A, 0, lda, 0, x, 0, incx, 0, beta, y, 0, incy, 0, batch_count); + return rocblas_gemv_template(handle, + transA, + m, + n, + alpha, + A, + 0, + lda, + 0, + x, + 0, + incx, + 0, + beta, + y, + 0, + incy, + 0, + batch_count); } } // namespace diff --git a/library/src/blas2/rocblas_gemv_strided_batched.cpp b/library/src/blas2/rocblas_gemv_strided_batched.cpp index 93a76b6b1..f7d46f558 100644 --- a/library/src/blas2/rocblas_gemv_strided_batched.cpp +++ b/library/src/blas2/rocblas_gemv_strided_batched.cpp @@ -160,24 +160,24 @@ namespace return rocblas_status_invalid_size; return rocblas_gemv_template(handle, - transA, - m, - n, - alpha, - A, - 0, - lda, - strideA, - x, - 0, - incx, - stridex, - beta, - y, - 0, - incy, - stridey, - batch_count); + transA, + m, + n, + alpha, + A, + 0, + lda, + strideA, + x, + 0, + incx, + stridex, + beta, + y, + 0, + incy, + stridey, + batch_count); } } //namespace diff --git a/library/src/blas2/rocblas_ger.cpp b/library/src/blas2/rocblas_ger.cpp index 815fc208c..8f7f76320 100644 --- a/library/src/blas2/rocblas_ger.cpp +++ b/library/src/blas2/rocblas_ger.cpp @@ -1,10 +1,10 @@ /* ************************************************************************ * Copyright 2016-2019 Advanced Micro Devices, Inc. * ************************************************************************ */ +#include "rocblas_ger.hpp" #include "handle.h" #include "logging.h" #include "rocblas.h" -#include "rocblas_ger.hpp" #include "utility.h" namespace @@ -88,8 +88,24 @@ namespace if(!m || !n) return rocblas_status_success; - rocblas_ger_template( - handle, m, n, alpha, 0, x, 0, incx, incx * m, y, 0, incy, incy * n, A, 0, lda, lda * n, 1); + rocblas_ger_template(handle, + m, + n, + alpha, + 0, + x, + 0, + incx, + incx * m, + y, + 0, + incy, + incy * n, + A, + 0, + lda, + lda * n, + 1); return rocblas_status_success; } diff --git a/library/src/blas2/rocblas_ger_batched.cpp b/library/src/blas2/rocblas_ger_batched.cpp index fd158169b..60bafbc8e 100644 --- a/library/src/blas2/rocblas_ger_batched.cpp +++ b/library/src/blas2/rocblas_ger_batched.cpp @@ -1,10 +1,10 @@ /* ************************************************************************ * Copyright 2016-2019 Advanced Micro Devices, Inc. * ************************************************************************ */ -#include "rocblas_ger.hpp" #include "handle.h" #include "logging.h" #include "rocblas.h" +#include "rocblas_ger.hpp" #include "utility.h" namespace diff --git a/library/src/blas2/rocblas_ger_strided_batched.cpp b/library/src/blas2/rocblas_ger_strided_batched.cpp index d157e6a89..163eebe3e 100644 --- a/library/src/blas2/rocblas_ger_strided_batched.cpp +++ b/library/src/blas2/rocblas_ger_strided_batched.cpp @@ -1,10 +1,10 @@ /* ************************************************************************ * Copyright 2016-2019 Advanced Micro Devices, Inc. * ************************************************************************ */ -#include "rocblas_ger.hpp" #include "handle.h" #include "logging.h" #include "rocblas.h" +#include "rocblas_ger.hpp" #include "utility.h" namespace @@ -137,23 +137,23 @@ namespace return rocblas_status_success; rocblas_ger_template(handle, - m, - n, - alpha, - 0, - x, - 0, - incx, - stridex, - y, - 0, - incy, - stridey, - A, - 0, - lda, - strideA, - batch_count); + m, + n, + alpha, + 0, + x, + 0, + incx, + stridex, + y, + 0, + incy, + stridey, + A, + 0, + lda, + strideA, + batch_count); return rocblas_status_success; } From 8a361a23af4c3deb72c7a0bb65933ccfa58e210a Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Thu, 3 Oct 2019 15:01:32 -0600 Subject: [PATCH 03/13] Updating stride types in Doc --- library/include/rocblas-functions.h | 73 ++++++++++++++++------------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/library/include/rocblas-functions.h b/library/include/rocblas-functions.h index 0960f6c98..cd5010c3a 100644 --- a/library/include/rocblas-functions.h +++ b/library/include/rocblas-functions.h @@ -190,12 +190,14 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal_batched(rocblas_handle @param[inout] x pointer storing vector x on the GPU. @param[in] - incx specifies the increment for the elements of x. + incx rocblas_int + specifies the increment for the elements of x. @param[in] - stride_x stride form the start of one vector (x_i) and the next one (x_i+1). - There are no restrictions placed on stride_x, however the user should - take care to ensure that stride_x is of appropriate size, for a typical - case this means stride_x > n * incx. + stride_x rocblas_stride + stride form the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size, for a typical + case this means stride_x > n * incx. @param[in] batch_count specifies the number of batches in x. ********************************************************************/ @@ -652,16 +654,19 @@ ROCBLAS_EXPORT rocblas_status rocblas_zswap_batched(rocblas_handle hand @param[inout] x a pointer to the first vector x_i on the GPU. @param[in] - incx specifies the increment for the elements of x. + incx rocblas_int + specifies the increment for the elements of x. @param[in] - stridex specifies the pointer increment between batches for x. + stridex rocblas_stride + specifies the pointer increment between batches for x. @param[inout] y a pointer to the first vector y_i on the GPU. @param[in] incy rocblas_int specifies the increment for the elements of y. @param[in] - stridey specifies the pointer increment between batches for y. + stridey rocblas_stride + specifies the pointer increment between batches for y. @param[in] batch_count rocblas_int number of instances in the batch @@ -889,7 +894,8 @@ ROCBLAS_EXPORT rocblas_status rocblas_dzasum_batched(rocblas_handle incx rocblas_int specifies the increment for the elements of each x_i. incx must be > 0. @param[in] - stridex specifies the pointer increment between batches for x. stridex must be be non zero. + stridex rocblas_stride + specifies the pointer increment between batches for x. stridex must be be non zero. @param[out] results pointer to array for storing contiguous batch_count results. either on the host CPU or device GPU. @@ -1047,7 +1053,8 @@ ROCBLAS_EXPORT rocblas_status rocblas_dznrm2_batched(rocblas_handle incx rocblas_int specifies the increment for the elements of each x_i. incx must be > 0. @param[in] - stridex specifies the pointer increment between batches for x. stridex must be non zero. + stridex rocblas_stride + specifies the pointer increment between batches for x. stridex must be non zero. @param[in] batch_count rocblas_int number of instances in the batch @@ -1630,7 +1637,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_batched(rocblas_handle lda rocblas_int specifies the leading dimension of matrices A_i. @param[in] - strideA rocblas_int + strideA rocblas_stride stride from the start of one matrix (A_i) and the next one (A_i+1) @param[in] x pointer to the first vector (x_0) in the batch stored on the GPU. @@ -1638,7 +1645,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_batched(rocblas_handle incx rocblas_int specifies the increment for the elements of vectors x_i. @param[in] - stridex rocblas_int + stridex rocblas_stride stride form the start of one vector (x_i) and the next one (x_i+1) @param[in] beta specifies the scalar beta. @@ -1648,7 +1655,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_batched(rocblas_handle incy rocblas_int specifies the increment for the elements of vectors y_i. @param[in] - stridey rocblas_int + stridey rocblas_stride stride from the start of one vector (y_i) and the next one (y_i+1) @param[in] batch_count rocblas_int @@ -1772,7 +1779,8 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_strided_batched(rocblas_handle x pointer storing vector x on the GPU. @param[in] - incx specifies the increment for the elements of x. + incx rocblas_int + specifies the increment for the elements of x. ********************************************************************/ ROCBLAS_EXPORT rocblas_status rocblas_strsv(rocblas_handle handle, @@ -1824,7 +1832,8 @@ ROCBLAS_EXPORT rocblas_status rocblas_dtrsv(rocblas_handle handle, @param[in] x pointer storing vector x on the GPU. @param[in] - incx specifies the increment for the elements of x. + incx rocblas_int + specifies the increment for the elements of x. @param[in] beta specifies the scalar beta. @param[out] @@ -2031,23 +2040,23 @@ ROCBLAS_EXPORT rocblas_status rocblas_dger_batched(rocblas_handle handle, incx rocblas_int specifies the increments for the elements of vectors x_i. @param[in] - stridex rocblas_int - stride form the start of one vector (x_i) and the next one (x_i+1) + stridex rocblas_stride + stride form the start of one vector (x_i) and the next one (x_i+1) @param[in] y pointer to the first vector (y_0) in the batch stored on the GPU. @param[in] incy rocblas_int specifies the increment for the elements of vectors y_i. @param[in] - stridey rocblas_int - stride from the start of one vector (y_i) and the next one (y_i+1) + stridey rocblas_stride + stride from the start of one vector (y_i) and the next one (y_i+1) @param[inout] A pointer to the first matrix (A_0) in the batch stored on the GPU. @param[in] lda rocblas_int specifies the leading dimension of A. @param[in] - strideA rocblas_int + strideA rocblas_stride stride from the start of one matrix (A_i) and the next one (A_i+1) @param[in] batch_count rocblas_int @@ -2252,7 +2261,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_dtrtri(rocblas_handle handle, lda rocblas_int specifies the leading dimension of A. @param[in] - stride_a rocblas_int + stride_a rocblas_stride "batch stride a": stride from the start of one "A" matrix to the next @param[out] invA pointer storing the inverse matrix A on the GPU. @@ -2267,11 +2276,11 @@ ROCBLAS_EXPORT rocblas_status rocblas_dtrtri(rocblas_handle handle, ldinvA rocblas_int specifies the leading dimension of invA. @param[in] - stride_invA rocblas_int - "batch stride invA": stride from the start of one "invA" matrix to the next + stride_invA rocblas_stride + "batch stride invA": stride from the start of one "invA" matrix to the next @param[in] - batch_count rocblas_int - numbers of matrices in the batch + batch_count rocblas_int + numbers of matrices in the batch ********************************************************************/ ROCBLAS_EXPORT rocblas_status rocblas_strtri_batched(rocblas_handle handle, @@ -2743,7 +2752,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemm_batched(rocblas_handle lda rocblas_int specifies the leading dimension of "A". @param[in] - stride_a rocblas_stride + stride_a rocblas_stride stride from the start of one "A" matrix to the next @param[in] B pointer storing strided batched matrix B on the GPU. @@ -2751,7 +2760,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemm_batched(rocblas_handle ldb rocblas_int specifies the leading dimension of "B". @param[in] - stride_b rocblas_stride + stride_b rocblas_stride stride from the start of one "B" matrix to the next @param[in] beta specifies the scalar beta. @@ -2761,7 +2770,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemm_batched(rocblas_handle ldc rocblas_int specifies the leading dimension of "C". @param[in] - stride_c rocblas_stride + stride_c rocblas_stride stride from the start of one "C" matrix to the next @param[in] batch_count @@ -3545,7 +3554,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_gemm_batched_ex(rocblas_handle handle, lda rocblas_int. specifies the leading dimension of A. @param[in] - stride_a rocblas_long. + stride_a rocblas_stride. specifies stride from start of one "A" matrix to the next. @param[in] b void *. @@ -3557,7 +3566,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_gemm_batched_ex(rocblas_handle handle, ldb rocblas_int. specifies the leading dimension of B. @param[in] - stride_b rocblas_long. + stride_b rocblas_stride. specifies stride from start of one "B" matrix to the next. @param[in] beta const void *. @@ -3572,7 +3581,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_gemm_batched_ex(rocblas_handle handle, ldc rocblas_int. specifies the leading dimension of C. @param[in] - stride_c rocblas_long. + stride_c rocblas_stride. specifies stride from start of one "C" matrix to the next. @param[out] d void *. @@ -3584,7 +3593,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_gemm_batched_ex(rocblas_handle handle, ldd rocblas_int. specifies the leading dimension of D. @param[in] - stride_d rocblas_long. + stride_d rocblas_stride. specifies stride from start of one "D" matrix to the next. @param[in] batch_count From 8014f6e624b244e401ae2260f88b63e5c39e214d Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Thu, 3 Oct 2019 15:34:40 -0600 Subject: [PATCH 04/13] unify gemv yaml files --- clients/gtest/CMakeLists.txt | 2 +- clients/gtest/gemv_batched_gtest.yaml | 91 ----------- clients/gtest/gemv_gtest.yaml | 142 +++++++++++++++--- clients/gtest/gemv_strided_batched_gtest.yaml | 95 ------------ clients/gtest/rocblas_gtest.yaml | 2 - 5 files changed, 119 insertions(+), 213 deletions(-) delete mode 100644 clients/gtest/gemv_batched_gtest.yaml delete mode 100644 clients/gtest/gemv_strided_batched_gtest.yaml diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 4762e744f..632d57cc2 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -178,7 +178,7 @@ endif( ) set( ROCBLAS_TEST_DATA "${PROJECT_BINARY_DIR}/staging/rocblas_gtest.data") add_custom_command( OUTPUT "${ROCBLAS_TEST_DATA}" COMMAND ../common/rocblas_gentest.py -I ../include rocblas_gtest.yaml -o "${ROCBLAS_TEST_DATA}" - DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_batched_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml gemv_batched_gtest.yaml gemv_strided_batched_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml + DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_batched_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" ) add_custom_target( rocblas-test-data DEPENDS "${ROCBLAS_TEST_DATA}" ) diff --git a/clients/gtest/gemv_batched_gtest.yaml b/clients/gtest/gemv_batched_gtest.yaml deleted file mode 100644 index e0eeedd3d..000000000 --- a/clients/gtest/gemv_batched_gtest.yaml +++ /dev/null @@ -1,91 +0,0 @@ ---- -include: rocblas_common.yaml -include: known_bugs.yaml - -Definitions: - - &small_matrix_size_range - - { M: -1, N: 1, lda: 1 } - - { M: 1, N: -1, lda: 1 } - - { M: 1, N: 1, lda: 0 } - - { M: 10, N: 10, lda: 9 } - - { M: 0, N: 1, lda: 1 } - - { M: 1, N: 0, lda: 1 } - - { M: -1, N: -1, lda: -1 } - - { M: 10, N: 10, lda: 2 } - - { M: 100, N: 200, lda: 200 } - - - &medium_matrix_size_range - - { M: 300, N: 400, lda: 400 } - - { M: 600, N: 500, lda: 601 } - - - &large_matrix_size_range - - { M: 1000, N: 1000, lda: 1000 } - - { M: 2000, N: 2000, lda: 2000 } - - { M: 4011, N: 4011, lda: 4011 } - - { M: 8000, N: 8000, lda: 8000 } - - - &incx_incy_range - - { incx: 2, incy: 1 } - - { incx: -1, incy: 2 } - - { incx: 1, incy: 1 } - - { incx: -1, incy: 3 } - - { incx: 3, incy: -1 } - - { incx: 0, incy: 1 } - - { incx: 1, incy: 0 } - - { incx: 0, incy: -1 } - - { incx: 10, incy: 100 } - - - &alpha_beta_range - - { alpha: 2.0, beta: 0.0 } - - { alpha: -1.0, beta: -1.0 } - - { alpha: 2.0, beta: 1.0 } - - { alpha: 0.0, beta: 1.0 } - -Tests: -- name: gemv_batched_bad_arg - category: pre_checkin - function: gemv_batched_bad_arg - precision: *single_double_precisions - transA: N - -- name: gemv_batched_NaN - category: pre_checkin - function: gemv_batched - precision: *single_double_precisions - transA: [ N, T, C ] - matrix_size: *medium_matrix_size_range - incx_incy: *incx_incy_range - alpha: [ -1.0, 0, 1.0, 2.0 ] - beta: .NaN # converted to 0.0 in test code - batch_count: [ -1, 0, 1, 3 ] - -- name: gemv_batched_small - category: quick - function: gemv_batched - precision: *single_double_precisions - transA: [ N, T, C ] - matrix_size: *small_matrix_size_range - incx_incy: *incx_incy_range - alpha_beta: *alpha_beta_range - batch_count: [ -1, 0, 1, 3 ] - -- name: gemv_batched_medium - category: pre_checkin - function: gemv_batched - precision: *single_double_precisions_complex_real - transA: [ N, T, C ] - matrix_size: *medium_matrix_size_range - incx_incy: *incx_incy_range - alpha_beta: *alpha_beta_range - batch_count: [ 3 ] - -- name: gemv_batched_large - category: nightly - function: gemv_batched - precision: *single_double_precisions - transA: [ N, T, C ] - matrix_size: *large_matrix_size_range - incx_incy: *incx_incy_range - alpha_beta: *alpha_beta_range - batch_count: [ 3 ] -... diff --git a/clients/gtest/gemv_gtest.yaml b/clients/gtest/gemv_gtest.yaml index 9dda01aee..db8b6ea6f 100644 --- a/clients/gtest/gemv_gtest.yaml +++ b/clients/gtest/gemv_gtest.yaml @@ -4,36 +4,36 @@ include: known_bugs.yaml Definitions: - &small_matrix_size_range - - { M: -1, N: 1, lda: 1 } - - { M: 1, N: -1, lda: 1 } - - { M: 1, N: 1, lda: 0 } - - { M: 10, N: 10, lda: 9 } - - { M: 0, N: 1, lda: 1 } - - { M: 1, N: 0, lda: 1 } - - { M: -1, N: -1, lda: -1 } - - { M: 10, N: 10, lda: 2 } - - { M: 100, N: 200, lda: 200 } + - { M: -1, N: 1, lda: 1, stride_a: 1 } + - { M: 1, N: -1, lda: 1, stride_a: 1 } + - { M: 1, N: 1, lda: 0, stride_a: 1 } + - { M: 10, N: 10, lda: 9, stride_a: 1 } + - { M: 0, N: 1, lda: 1, stride_a: 1 } + - { M: 1, N: 0, lda: 1, stride_a: 1 } + - { M: -1, N: -1, lda: -1, stride_a: 1 } + - { M: 10, N: 10, lda: 2, stride_a: 1 } + - { M: 100, N: 200, lda: 200, stride_a: 40000 } - &medium_matrix_size_range - - { M: 300, N: 400, lda: 400 } - - { M: 600, N: 500, lda: 601 } + - { M: 300, N: 400, lda: 400, stride_a: 160000 } + - { M: 600, N: 500, lda: 601, stride_a: 301000 } - &large_matrix_size_range - - { M: 1000, N: 1000, lda: 1000 } - - { M: 2000, N: 2000, lda: 2000 } - - { M: 4011, N: 4011, lda: 4011 } - - { M: 8000, N: 8000, lda: 8000 } + - { M: 1000, N: 1000, lda: 1000, stride_a: 1000000 } + - { M: 2000, N: 2000, lda: 2000, stride_a: 4000000 } + - { M: 4011, N: 4011, lda: 4011, stride_a: 16088200 } + - { M: 8000, N: 8000, lda: 8000, stride_a: 64000000 } - &incx_incy_range - - { incx: 2, incy: 1 } - - { incx: -1, incy: 2 } - - { incx: 1, incy: 1 } - - { incx: -1, incy: 3 } - - { incx: 3, incy: -1 } - - { incx: 0, incy: 1 } - - { incx: 1, incy: 0 } - - { incx: 0, incy: -1 } - - { incx: 10, incy: 100 } + - { incx: 2, incy: 1, stride_scale: 1 } + - { incx: -1, incy: 2, stride_scale: 1 } + - { incx: 1, incy: 1, stride_scale: 1 } + - { incx: -1, incy: 3, stride_scale: 1.5 } + - { incx: 3, incy: -1, stride_scale: 1 } + - { incx: 0, incy: 1, stride_scale: 1 } + - { incx: 1, incy: 0, stride_scale: 1 } + - { incx: 0, incy: -1, stride_scale: 2 } + - { incx: 10, incy: 100, stride_scale: 1 } - &alpha_beta_range - { alpha: 2.0, beta: 0.0, alphai: 1.5, betai: 0.5 } @@ -84,4 +84,98 @@ Tests: matrix_size: *large_matrix_size_range incx_incy: *incx_incy_range alpha_beta: *alpha_beta_range + +- name: gemv_batched_bad_arg + category: pre_checkin + function: gemv_batched_bad_arg + precision: *single_double_precisions + transA: N + +- name: gemv_batched_NaN + category: pre_checkin + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha: [ -1.0, 0, 1.0, 2.0 ] + beta: .NaN # converted to 0.0 in test code + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_batched_small + category: quick + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *small_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_batched_medium + category: pre_checkin + function: gemv_batched + precision: *single_double_precisions_complex_real + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] + +- name: gemv_batched_large + category: nightly + function: gemv_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *large_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] + +- name: gemv_strided_batched_bad_arg + category: pre_checkin + function: gemv_strided_batched_bad_arg + precision: *single_double_precisions + transA: N + +- name: gemv_strided_batched_NaN + category: pre_checkin + function: gemv_strided_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha: [ -1.0, 0, 1.0, 2.0 ] + beta: .NaN # converted to 0.0 in test code + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_strided_batched_small + category: quick + function: gemv_strided_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *small_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ -1, 0, 1, 3 ] + +- name: gemv_strided_batched_medium + category: pre_checkin + function: gemv_strided_batched + precision: *single_double_precisions_complex_real + transA: [ N, T, C ] + matrix_size: *medium_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] + +- name: gemv_strided_batched_large + category: nightly + function: gemv_strided_batched + precision: *single_double_precisions + transA: [ N, T, C ] + matrix_size: *large_matrix_size_range + incx_incy: *incx_incy_range + alpha_beta: *alpha_beta_range + batch_count: [ 3 ] ... diff --git a/clients/gtest/gemv_strided_batched_gtest.yaml b/clients/gtest/gemv_strided_batched_gtest.yaml deleted file mode 100644 index 46a14601d..000000000 --- a/clients/gtest/gemv_strided_batched_gtest.yaml +++ /dev/null @@ -1,95 +0,0 @@ ---- -include: rocblas_common.yaml -include: known_bugs.yaml - -Definitions: - - &small_matrix_size_range - - { M: -1, N: 1, lda: 1, stride_a: 1 } - - { M: 1, N: -1, lda: 1, stride_a: 1 } - - { M: 1, N: 1, lda: 0, stride_a: 1 } - - { M: 10, N: 10, lda: 9, stride_a: 1 } - - { M: 0, N: 1, lda: 1, stride_a: 1 } - - { M: 1, N: 0, lda: 1, stride_a: 1 } - - { M: -1, N: -1, lda: -1, stride_a: 1 } - - { M: 10, N: 10, lda: 2, stride_a: 1 } - - { M: 100, N: 200, lda: 200, stride_a: 40000 } - - - &medium_matrix_size_range - - { M: 300, N: 400, lda: 400, stride_a: 160000 } - - { M: 600, N: 500, lda: 601, stride_a: 301000 } - - - &large_matrix_size_range - - { M: 1000, N: 1000, lda: 1000, stride_a: 1000000 } - - { M: 2000, N: 2000, lda: 2000, stride_a: 4000000 } - - { M: 4011, N: 4011, lda: 4011, stride_a: 16088200 } - - { M: 8000, N: 8000, lda: 8000, stride_a: 64000000 } - - - &incx_incy_range - - { incx: 2, incy: 1 } - - { incx: -1, incy: 2 } - - { incx: 1, incy: 1 } - - { incx: -1, incy: 3 } - - { incx: 3, incy: -1 } - - { incx: 0, incy: 1 } - - { incx: 1, incy: 0 } - - { incx: 0, incy: -1 } - - { incx: 10, incy: 100 } - - - &alpha_beta_range - - { alpha: 2.0, beta: 0.0 } - - { alpha: -1.0, beta: -1.0 } - - { alpha: 2.0, beta: 1.0 } - - { alpha: 0.0, beta: 1.0 } - -Tests: -- name: gemv_strided_batched_bad_arg - category: pre_checkin - function: gemv_strided_batched_bad_arg - precision: *single_double_precisions - transA: N - -- name: gemv_strided_batched_NaN - category: pre_checkin - function: gemv_strided_batched - precision: *single_double_precisions - transA: [ N, T, C ] - matrix_size: *medium_matrix_size_range - incx_incy: *incx_incy_range - alpha: [ -1.0, 0, 1.0, 2.0 ] - beta: .NaN # converted to 0.0 in test code - stride_scale: [ 1 ] - batch_count: [ -1, 0, 1, 3 ] - -- name: gemv_strided_batched_small - category: quick - function: gemv_strided_batched - precision: *single_double_precisions - transA: [ N, T, C ] - matrix_size: *small_matrix_size_range - incx_incy: *incx_incy_range - alpha_beta: *alpha_beta_range - stride_scale: [ 1 ] - batch_count: [ -1, 0, 1, 3 ] - -- name: gemv_strided_batched_medium - category: pre_checkin - function: gemv_strided_batched - precision: *single_double_precisions_complex_real - transA: [ N, T, C ] - matrix_size: *medium_matrix_size_range - incx_incy: *incx_incy_range - alpha_beta: *alpha_beta_range - stride_scale: [ 1 ] - batch_count: [ 3 ] - -- name: gemv_strided_batched_large - category: nightly - function: gemv_strided_batched - precision: *single_double_precisions - transA: [ N, T, C ] - matrix_size: *large_matrix_size_range - incx_incy: *incx_incy_range - alpha_beta: *alpha_beta_range - stride_scale: [ 1 ] - batch_count: [ 3 ] -... diff --git a/clients/gtest/rocblas_gtest.yaml b/clients/gtest/rocblas_gtest.yaml index 3cbf2a1ac..5c4057fbd 100644 --- a/clients/gtest/rocblas_gtest.yaml +++ b/clients/gtest/rocblas_gtest.yaml @@ -1,7 +1,5 @@ include: blas1_gtest.yaml include: gemv_gtest.yaml -include: gemv_batched_gtest.yaml -include: gemv_strided_batched_gtest.yaml include: gemm_gtest.yaml include :gemm_batched_gtest.yaml include: gemm_strided_batched_gtest.yaml From 11e61aeadd4e64bc8568dfd2cda097e0bdcabfe5 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Thu, 3 Oct 2019 15:51:54 -0600 Subject: [PATCH 05/13] zero strides for ger --- library/src/blas2/rocblas_ger.cpp | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/library/src/blas2/rocblas_ger.cpp b/library/src/blas2/rocblas_ger.cpp index 8f7f76320..02ae5e8f9 100644 --- a/library/src/blas2/rocblas_ger.cpp +++ b/library/src/blas2/rocblas_ger.cpp @@ -88,24 +88,8 @@ namespace if(!m || !n) return rocblas_status_success; - rocblas_ger_template(handle, - m, - n, - alpha, - 0, - x, - 0, - incx, - incx * m, - y, - 0, - incy, - incy * n, - A, - 0, - lda, - lda * n, - 1); + rocblas_ger_template( + handle, m, n, alpha, 0, x, 0, incx, 0, y, 0, incy, 0, A, 0, lda, 0, 1); return rocblas_status_success; } From 602633de2625aef0536f9a538b73c969d7cab1d9 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Thu, 3 Oct 2019 16:08:12 -0600 Subject: [PATCH 06/13] ger - using helper to get alpha --- clients/include/testing_ger.hpp | 2 +- clients/include/testing_ger_batched.hpp | 2 +- clients/include/testing_ger_strided_batched.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clients/include/testing_ger.hpp b/clients/include/testing_ger.hpp index 60df009fd..155478cd2 100644 --- a/clients/include/testing_ger.hpp +++ b/clients/include/testing_ger.hpp @@ -60,7 +60,7 @@ void testing_ger(const Arguments& arg) rocblas_int incx = arg.incx; rocblas_int incy = arg.incy; rocblas_int lda = arg.lda; - T h_alpha = (T)arg.alpha; + T h_alpha = arg.get_alpha(); rocblas_local_handle handle; diff --git a/clients/include/testing_ger_batched.hpp b/clients/include/testing_ger_batched.hpp index 0a9698224..746a73faf 100644 --- a/clients/include/testing_ger_batched.hpp +++ b/clients/include/testing_ger_batched.hpp @@ -59,7 +59,7 @@ void testing_ger_batched(const Arguments& arg) rocblas_int incx = arg.incx; rocblas_int incy = arg.incy; rocblas_int lda = arg.lda; - T h_alpha = (T)arg.alpha; + T h_alpha = arg.get_alpha(); rocblas_int batch_count = arg.batch_count; rocblas_local_handle handle; diff --git a/clients/include/testing_ger_strided_batched.hpp b/clients/include/testing_ger_strided_batched.hpp index 2a6d0c6ea..cffcec420 100644 --- a/clients/include/testing_ger_strided_batched.hpp +++ b/clients/include/testing_ger_strided_batched.hpp @@ -116,7 +116,7 @@ void testing_ger_strided_batched(const Arguments& arg) rocblas_int incx = arg.incx; rocblas_int incy = arg.incy; rocblas_int lda = arg.lda; - T h_alpha = (T)arg.alpha; + T h_alpha = arg.get_alpha(); rocblas_int stride_x = arg.stride_x; rocblas_int stride_y = arg.stride_y; rocblas_int stride_a = arg.stride_a; From 8949fa683f3f8ba7be9262f60c4bca208b247913 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Thu, 3 Oct 2019 18:04:23 -0600 Subject: [PATCH 07/13] rename kernel, doc update, reduce duplication --- clients/common/rocblas_gentest.py | 26 ++-- library/include/rocblas-functions.h | 58 +++++++-- library/src/blas2/gemv_device.hpp | 194 ++++++++++++++-------------- library/src/blas2/rocblas_gemv.hpp | 12 +- 4 files changed, 156 insertions(+), 134 deletions(-) diff --git a/clients/common/rocblas_gentest.py b/clients/common/rocblas_gentest.py index fe51d2211..15378b528 100755 --- a/clients/common/rocblas_gentest.py +++ b/clients/common/rocblas_gentest.py @@ -206,29 +206,21 @@ def setdefaults(test): test.setdefault('stride_y', int(test['N'] * abs(test['incy']) * test['stride_scale'])) - if test['function'] in ('ger_strided_batched'): - if all([x in test for x in ('M', 'incx', 'stride_scale')]): - test.setdefault('stride_x', int(test['M'] * abs(test['incx']) * - test['stride_scale'])) - if all([x in test for x in ('N', 'incy', 'stride_scale')]): - test.setdefault('stride_y', int(test['N'] * abs(test['incy']) * - test['stride_scale'])) - - if test['function'] in ('gemv_strided_batched'): - if test['transA'] == 'N': - if all([x in test for x in ('N', 'incx', 'stride_scale')]): - test.setdefault('stride_x', int(test['N'] * abs(test['incx']) * - test['stride_scale'])) - if all([x in test for x in ('M', 'incy', 'stride_scale')]): - test.setdefault('stride_y', int(test['M'] * abs(test['incy']) * - test['stride_scale'])) - else: + if test['function'] in ('gemv_strided_batched', 'ger_strided_batched'): + if test['function'] in ('ger_strided_batched') or test['transA'] in ('T', 'C'): if all([x in test for x in ('M', 'incx', 'stride_scale')]): test.setdefault('stride_x', int(test['M'] * abs(test['incx']) * test['stride_scale'])) if all([x in test for x in ('N', 'incy', 'stride_scale')]): test.setdefault('stride_y', int(test['N'] * abs(test['incy']) * test['stride_scale'])) + else: + if all([x in test for x in ('N', 'incx', 'stride_scale')]): + test.setdefault('stride_x', int(test['N'] * abs(test['incx']) * + test['stride_scale'])) + if all([x in test for x in ('M', 'incy', 'stride_scale')]): + test.setdefault('stride_y', int(test['M'] * abs(test['incy']) * + test['stride_scale'])) test.setdefault('stride_x', 0) test.setdefault('stride_y', 0) diff --git a/library/include/rocblas-functions.h b/library/include/rocblas-functions.h index cd5010c3a..c1b2d886e 100644 --- a/library/include/rocblas-functions.h +++ b/library/include/rocblas-functions.h @@ -194,10 +194,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal_batched(rocblas_handle specifies the increment for the elements of x. @param[in] stride_x rocblas_stride - stride form the start of one vector (x_i) and the next one (x_i+1). + stride from the start of one vector (x_i) and the next one (x_i+1). There are no restrictions placed on stride_x, however the user should take care to ensure that stride_x is of appropriate size, for a typical - case this means stride_x > n * incx. + case this means stride_x >= n * incx. @param[in] batch_count specifies the number of batches in x. ********************************************************************/ @@ -386,15 +386,21 @@ ROCBLAS_EXPORT rocblas_status rocblas_zcopy_batched(rocblas_handle specifies the increments for the elements of vectors x_i. @param[in] stridex rocblas_stride - stride form the start of one vector (x_i) and the next one (x_i+1) - @param[in] + stride from the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size, for a typical + case this means stride_x >= n * incx. + @param[out] y pointer to the first vector (y_0) in the batch stored on the GPU. @param[in] incy rocblas_int specifies the increment for the elements of vectors y_i. @param[in] stridey rocblas_stride - stride from the start of one vector (y_i) and the next one (y_i+1) + stride from the start of one vector (y_i) and the next one (y_i+1). + There are no restrictions placed on stride_y, however the user should + take care to ensure that stride_y is of appropriate size, for a typical + case this means stride_y >= n * incy. stridey should be non zero. @param[in] incy rocblas_int specifies the increment for the elements of y. @@ -658,7 +664,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_zswap_batched(rocblas_handle hand specifies the increment for the elements of x. @param[in] stridex rocblas_stride - specifies the pointer increment between batches for x. + stride from the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size, for a typical + case this means stride_x >= n * incx. @param[inout] y a pointer to the first vector y_i on the GPU. @param[in] @@ -666,7 +675,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_zswap_batched(rocblas_handle hand specifies the increment for the elements of y. @param[in] stridey rocblas_stride - specifies the pointer increment between batches for y. + stride from the start of one vector (y_i) and the next one (y_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_y is of appropriate size, for a typical + case this means stride_y >= n * incy. stridey should be non zero. @param[in] batch_count rocblas_int number of instances in the batch @@ -895,7 +907,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_dzasum_batched(rocblas_handle specifies the increment for the elements of each x_i. incx must be > 0. @param[in] stridex rocblas_stride - specifies the pointer increment between batches for x. stridex must be be non zero. + stride from the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size, for a typical + case this means stride_x >= n * incx. @param[out] results pointer to array for storing contiguous batch_count results. either on the host CPU or device GPU. @@ -1054,7 +1069,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_dznrm2_batched(rocblas_handle specifies the increment for the elements of each x_i. incx must be > 0. @param[in] stridex rocblas_stride - specifies the pointer increment between batches for x. stridex must be non zero. + stride from the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size, for a typical + case this means stride_x >= n * incx. @param[in] batch_count rocblas_int number of instances in the batch @@ -1646,7 +1664,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_batched(rocblas_handle specifies the increment for the elements of vectors x_i. @param[in] stridex rocblas_stride - stride form the start of one vector (x_i) and the next one (x_i+1) + stride from the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size. When trans equals rocblas_operation_none + this typically means stride_x >= n * incx, otherwise stride_x >= m * incx. @param[in] beta specifies the scalar beta. @param[inout] @@ -1656,7 +1677,10 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_batched(rocblas_handle specifies the increment for the elements of vectors y_i. @param[in] stridey rocblas_stride - stride from the start of one vector (y_i) and the next one (y_i+1) + stride from the start of one vector (y_i) and the next one (y_i+1). + There are no restrictions placed on stride_y, however the user should + take care to ensure that stride_y is of appropriate size. When trans equals rocblas_operation_none + this typically means stride_y >= m * incy, otherwise stride_y >= n * incy. stridey should be non zero. @param[in] batch_count rocblas_int number of instances in the batch @@ -2041,15 +2065,21 @@ ROCBLAS_EXPORT rocblas_status rocblas_dger_batched(rocblas_handle handle, specifies the increments for the elements of vectors x_i. @param[in] stridex rocblas_stride - stride form the start of one vector (x_i) and the next one (x_i+1) - @param[in] + stride from the start of one vector (x_i) and the next one (x_i+1). + There are no restrictions placed on stride_x, however the user should + take care to ensure that stride_x is of appropriate size, for a typical + case this means stride_x >= m * incx. + @param[inout] y pointer to the first vector (y_0) in the batch stored on the GPU. @param[in] incy rocblas_int specifies the increment for the elements of vectors y_i. @param[in] stridey rocblas_stride - stride from the start of one vector (y_i) and the next one (y_i+1) + stride from the start of one vector (y_i) and the next one (y_i+1). + There are no restrictions placed on stride_y, however the user should + take care to ensure that stride_y is of appropriate size, for a typical + case this means stride_y >= n * incy. @param[inout] A pointer to the first matrix (A_0) in the batch stored on the GPU. @param[in] diff --git a/library/src/blas2/gemv_device.hpp b/library/src/blas2/gemv_device.hpp index 14a14bfcd..2800b2033 100644 --- a/library/src/blas2/gemv_device.hpp +++ b/library/src/blas2/gemv_device.hpp @@ -12,16 +12,16 @@ template {}, int>::type = 0> -__device__ void gemvn_kernel(rocblas_int m, - rocblas_int n, - U alpha, - const T* A, - rocblas_int lda, - const T* x, - rocblas_int incx, - U beta, - T* y, - rocblas_int incy) +__device__ void gemvn_kernel_worker(rocblas_int m, + rocblas_int n, + U alpha, + const T* A, + rocblas_int lda, + const T* x, + rocblas_int incx, + U beta, + T* y, + rocblas_int incy) { rocblas_int thread_id = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x; @@ -172,16 +172,16 @@ __device__ void gemvn_kernel(rocblas_int m, // Overload for double precision complex numbers. We run out of registers // if we use the above algorithm. template -__device__ void gemvn_kernel(rocblas_int m, - rocblas_int n, - U alpha, - const rocblas_double_complex* A, - rocblas_int lda, - const rocblas_double_complex* x, - rocblas_int incx, - U beta, - rocblas_double_complex* y, - rocblas_int incy) +__device__ void gemvn_kernel_worker(rocblas_int m, + rocblas_int n, + U alpha, + const rocblas_double_complex* A, + rocblas_int lda, + const rocblas_double_complex* x, + rocblas_int incx, + U beta, + rocblas_double_complex* y, + rocblas_int incy) { rocblas_int thread_id = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x; @@ -250,16 +250,16 @@ __device__ void gemvn_kernel(rocblas_int m, } template -__device__ void gemvc_kernel(rocblas_int m, - rocblas_int n, - U alpha, - const T* A, - rocblas_int lda, - const T* x, - rocblas_int incx, - U beta, - T* y, - rocblas_int incy) +__device__ void gemvc_kernel_worker(rocblas_int m, + rocblas_int n, + U alpha, + const T* A, + rocblas_int lda, + const T* x, + rocblas_int incx, + U beta, + T* y, + rocblas_int incy) { rocblas_int tx = hipThreadIdx_x; @@ -317,16 +317,16 @@ __device__ void gemvc_kernel(rocblas_int m, } template -__device__ void gemvt_kernel(rocblas_int m, - rocblas_int n, - U alpha, - const T* A, - rocblas_int lda, - const T* x, - rocblas_int incx, - U beta, - T* y, - rocblas_int incy) +__device__ void gemvt_kernel_worker(rocblas_int m, + rocblas_int n, + U alpha, + const T* A, + rocblas_int lda, + const T* x, + rocblas_int incx, + U beta, + T* y, + rocblas_int incy) { rocblas_int tx = hipThreadIdx_x; @@ -384,24 +384,24 @@ __device__ void gemvt_kernel(rocblas_int m, } template -__global__ void gemvn_kernel_test(rocblas_int m, - rocblas_int n, - U alpha_device_host, - rocblas_stride stride_alpha, - const V* Aa, - ptrdiff_t shifta, - rocblas_int lda, - rocblas_stride strideA, - const V* xa, - ptrdiff_t shiftx, - rocblas_int incx, - rocblas_stride stridex, - U beta_device_host, - rocblas_stride stride_beta, - W* ya, - ptrdiff_t shifty, - rocblas_int incy, - rocblas_stride stridey) +__global__ void gemvn_kernel(rocblas_int m, + rocblas_int n, + U alpha_device_host, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_stride stridex, + U beta_device_host, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_stride stridey) { rocblas_int num_threads = hipBlockDim_x * hipBlockDim_y * hipBlockDim_z; if(DIM_X * DIM_Y != num_threads) @@ -414,28 +414,28 @@ __global__ void gemvn_kernel_test(rocblas_int m, auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); - gemvn_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); + gemvn_kernel_worker(m, n, alpha, A, lda, x, incx, beta, y, incy); } template -__global__ void gemvc_kernel_test(rocblas_int m, - rocblas_int n, - U alpha_device_host, - rocblas_stride stride_alpha, - const V* Aa, - ptrdiff_t shifta, - rocblas_int lda, - rocblas_stride strideA, - const V* xa, - ptrdiff_t shiftx, - rocblas_int incx, - rocblas_stride stridex, - U beta_device_host, - rocblas_stride stride_beta, - W* ya, - ptrdiff_t shifty, - rocblas_int incy, - rocblas_stride stridey) +__global__ void gemvc_kernel(rocblas_int m, + rocblas_int n, + U alpha_device_host, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_stride stridex, + U beta_device_host, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_stride stridey) { const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); @@ -444,28 +444,28 @@ __global__ void gemvc_kernel_test(rocblas_int m, auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); - gemvc_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); + gemvc_kernel_worker(m, n, alpha, A, lda, x, incx, beta, y, incy); } template -__global__ void gemvt_kernel_test(rocblas_int m, - rocblas_int n, - U alpha_device_host, - rocblas_stride stride_alpha, - const V* Aa, - ptrdiff_t shifta, - rocblas_int lda, - rocblas_stride strideA, - const V* xa, - ptrdiff_t shiftx, - rocblas_int incx, - rocblas_stride stridex, - U beta_device_host, - rocblas_stride stride_beta, - W* ya, - ptrdiff_t shifty, - rocblas_int incy, - rocblas_stride stridey) +__global__ void gemvt_kernel(rocblas_int m, + rocblas_int n, + U alpha_device_host, + rocblas_stride stride_alpha, + const V* Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_stride strideA, + const V* xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_stride stridex, + U beta_device_host, + rocblas_stride stride_beta, + W* ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_stride stridey) { const T* A = load_ptr_batch(Aa, hipBlockIdx_y, shifta, strideA); const T* x = load_ptr_batch(xa, hipBlockIdx_y, shiftx, stridex); @@ -474,7 +474,7 @@ __global__ void gemvt_kernel_test(rocblas_int m, auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); - gemvt_kernel(m, n, alpha, A, lda, x, incx, beta, y, incy); + gemvt_kernel_worker(m, n, alpha, A, lda, x, incx, beta, y, incy); } #endif diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp index 0f91bff4b..3d05d6709 100644 --- a/library/src/blas2/rocblas_gemv.hpp +++ b/library/src/blas2/rocblas_gemv.hpp @@ -55,7 +55,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, if(handle->pointer_mode == rocblas_pointer_mode_device) { - hipLaunchKernelGGL((gemvn_kernel_test), + hipLaunchKernelGGL((gemvn_kernel), gemvn_grid, gemvn_threads, 0, @@ -84,7 +84,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, if(!*alpha && *beta == 1) return rocblas_status_success; - hipLaunchKernelGGL((gemvn_kernel_test), + hipLaunchKernelGGL((gemvn_kernel), gemvn_grid, gemvn_threads, 0, @@ -119,7 +119,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, if(handle->pointer_mode == rocblas_pointer_mode_device) { - hipLaunchKernelGGL((gemvt_kernel_test), + hipLaunchKernelGGL((gemvt_kernel), gemvt_grid, gemvt_threads, 0, @@ -148,7 +148,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, if(!*alpha && *beta == 1) return rocblas_status_success; - hipLaunchKernelGGL((gemvt_kernel_test), + hipLaunchKernelGGL((gemvt_kernel), gemvt_grid, gemvt_threads, 0, @@ -183,7 +183,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, if(handle->pointer_mode == rocblas_pointer_mode_device) { - hipLaunchKernelGGL((gemvc_kernel_test), + hipLaunchKernelGGL((gemvc_kernel), gemvc_grid, gemvc_threads, 0, @@ -212,7 +212,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, if(!*alpha && *beta == 1) return rocblas_status_success; - hipLaunchKernelGGL((gemvc_kernel_test), + hipLaunchKernelGGL((gemvc_kernel), gemvc_grid, gemvc_threads, 0, From b3f654b9bace6d8fa2e75cfbfdbef1c05255e0d6 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Fri, 4 Oct 2019 12:40:33 -0600 Subject: [PATCH 08/13] add missing file, clean up --- library/src/blas2/rocblas_ger.cpp | 4 - library/src/blas2/rocblas_ger.hpp | 125 ++++++++++++++++++ library/src/blas2/rocblas_ger_batched.cpp | 4 - .../src/blas2/rocblas_ger_strided_batched.cpp | 4 - library/src/include/utility.h | 16 --- 5 files changed, 125 insertions(+), 28 deletions(-) create mode 100644 library/src/blas2/rocblas_ger.hpp diff --git a/library/src/blas2/rocblas_ger.cpp b/library/src/blas2/rocblas_ger.cpp index 02ae5e8f9..e37a6e2e0 100644 --- a/library/src/blas2/rocblas_ger.cpp +++ b/library/src/blas2/rocblas_ger.cpp @@ -84,10 +84,6 @@ namespace if(m < 0 || n < 0 || !incx || !incy || lda < m || lda < 1) return rocblas_status_invalid_size; - // Quick return if possible. Not Argument error - if(!m || !n) - return rocblas_status_success; - rocblas_ger_template( handle, m, n, alpha, 0, x, 0, incx, 0, y, 0, incy, 0, A, 0, lda, 0, 1); diff --git a/library/src/blas2/rocblas_ger.hpp b/library/src/blas2/rocblas_ger.hpp new file mode 100644 index 000000000..fc0997161 --- /dev/null +++ b/library/src/blas2/rocblas_ger.hpp @@ -0,0 +1,125 @@ +/* ************************************************************************ + * Copyright 2016-2019 Advanced Micro Devices, Inc. + * ************************************************************************ */ +#include "handle.h" +#include "logging.h" +#include "rocblas.h" +#include "utility.h" + +template +__global__ void ger_kernel(rocblas_int m, + rocblas_int n, + W alpha_device_host, + rocblas_stride stride_alpha, + const U __restrict__ xa, + ptrdiff_t shiftx, + rocblas_int incx, + rocblas_int stridex, + const U __restrict__ ya, + ptrdiff_t shifty, + rocblas_int incy, + rocblas_int stridey, + V Aa, + ptrdiff_t shifta, + rocblas_int lda, + rocblas_int strideA) +{ + + ptrdiff_t tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + ptrdiff_t ty = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + + if(tx < m && ty < n) + { + auto alpha = load_scalar(alpha_device_host, hipBlockIdx_z, stride_alpha); + T* A = load_ptr_batch(Aa, hipBlockIdx_z, shifta, strideA); + const T* __restrict__ x = load_ptr_batch(xa, hipBlockIdx_z, shiftx, stridex); + const T* __restrict__ y = load_ptr_batch(ya, hipBlockIdx_z, shifty, stridey); + + A[tx + lda * ty] += alpha * x[tx * incx] * y[ty * incy]; + } +} + +template +rocblas_status rocblas_ger_template(rocblas_handle handle, + rocblas_int m, + rocblas_int n, + const W* alpha, + rocblas_stride stride_alpha, + const U* x, + rocblas_int offsetx, + rocblas_int incx, + rocblas_int stridex, + const U* y, + rocblas_int offsety, + rocblas_int incy, + rocblas_int stridey, + V* A, + rocblas_int offsetA, + rocblas_int lda, + rocblas_int strideA, + rocblas_int batch_count) +{ + // Quick return if possible. Not Argument error + if(!m || !n || !batch_count) + return rocblas_status_success; + + hipStream_t rocblas_stream = handle->rocblas_stream; + + // in case of negative inc shift pointer to end of data for negative indexing tid*inc + auto shiftx = incx < 0 ? offsetx - ptrdiff_t(incx) * (m - 1) : offsetx; + auto shifty = incy < 0 ? offsety - ptrdiff_t(incy) * (n - 1) : offsety; + + static constexpr int GEMV_DIM_X = 128; + static constexpr int GEMV_DIM_Y = 8; + rocblas_int blocksX = (m - 1) / GEMV_DIM_X + 1; + rocblas_int blocksY = (n - 1) / GEMV_DIM_Y + 1; + + dim3 grid(blocksX, blocksY, batch_count); + dim3 threads(GEMV_DIM_X, GEMV_DIM_Y); + + if(handle->pointer_mode == rocblas_pointer_mode_device) + hipLaunchKernelGGL(ger_kernel, + grid, + threads, + 0, + rocblas_stream, + m, + n, + alpha, + stride_alpha, + x, + shiftx, + incx, + stridex, + y, + shifty, + incy, + stridey, + A, + offsetA, + lda, + strideA); + else + hipLaunchKernelGGL(ger_kernel, + grid, + threads, + 0, + rocblas_stream, + m, + n, + *alpha, + stride_alpha, + x, + shiftx, + incx, + stridex, + y, + shifty, + incy, + stridey, + A, + offsetA, + lda, + strideA); + return rocblas_status_success; +} diff --git a/library/src/blas2/rocblas_ger_batched.cpp b/library/src/blas2/rocblas_ger_batched.cpp index 60bafbc8e..05c96ba65 100644 --- a/library/src/blas2/rocblas_ger_batched.cpp +++ b/library/src/blas2/rocblas_ger_batched.cpp @@ -111,10 +111,6 @@ namespace if(m < 0 || n < 0 || !incx || !incy || lda < m || lda < 1 || batch_count < 0) return rocblas_status_invalid_size; - // Quick return if possible. Not Argument error - if(!m || !n || !batch_count) - return rocblas_status_success; - rocblas_ger_template( handle, m, n, alpha, 0, x, 0, incx, 0, y, 0, incy, 0, A, 0, lda, 0, batch_count); diff --git a/library/src/blas2/rocblas_ger_strided_batched.cpp b/library/src/blas2/rocblas_ger_strided_batched.cpp index 163eebe3e..0aa59c71d 100644 --- a/library/src/blas2/rocblas_ger_strided_batched.cpp +++ b/library/src/blas2/rocblas_ger_strided_batched.cpp @@ -132,10 +132,6 @@ namespace if(m < 0 || n < 0 || !incx || !incy || lda < m || lda < 1 || batch_count < 0) return rocblas_status_invalid_size; - // Quick return if possible. Not Argument error - if(!m || !n || !batch_count) - return rocblas_status_success; - rocblas_ger_template(handle, m, n, diff --git a/library/src/include/utility.h b/library/src/include/utility.h index 6efaeaee7..e7716b380 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -112,22 +112,6 @@ __forceinline__ __device__ __host__ T* return p[block] + offset; } -// For device pointers -template -__forceinline__ __device__ __host__ T* - load_ptr_batch(T* p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) -{ - return p + block * stride + offset; -} - -// For device array of device pointers -template -__forceinline__ __device__ __host__ T* - load_ptr_batch(T* const* p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) -{ - return p[block] + offset; -} - #endif // GOOGLE_TEST inline bool isAligned(const void* pointer, size_t byte_count) From 8613a22f2a745a06c5f995f009a1b2d4799ca27c Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Fri, 4 Oct 2019 12:44:31 -0600 Subject: [PATCH 09/13] fix --- library/src/blas2/rocblas_gemv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp index 3d05d6709..9b89e7b1a 100644 --- a/library/src/blas2/rocblas_gemv.hpp +++ b/library/src/blas2/rocblas_gemv.hpp @@ -29,7 +29,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle handle, rocblas_int batch_count) { //quick return - if(!m || !n) + if(!m || !n || !batch_count) return rocblas_status_success; hipStream_t rocblas_stream = handle->rocblas_stream; From caaf634f4d65b85b71148b49bd9dceea2347d094 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Fri, 4 Oct 2019 13:32:47 -0600 Subject: [PATCH 10/13] update gemv calls in trsv --- library/src/blas2/rocblas_trsv.cpp | 151 ++++++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 11 deletions(-) diff --git a/library/src/blas2/rocblas_trsv.cpp b/library/src/blas2/rocblas_trsv.cpp index 1474a1984..b324a7e86 100644 --- a/library/src/blas2/rocblas_trsv.cpp +++ b/library/src/blas2/rocblas_trsv.cpp @@ -130,12 +130,19 @@ namespace BLOCK, &negative_one, A + BLOCK, + 0, lda, + 0, X, + 0, 1, + 0, &one, B + BLOCK * incx, - incx); + 0, + incx, + 0, + 1); // remaining blocks for(i = BLOCK; i < m; i += BLOCK) @@ -148,11 +155,18 @@ namespace jb, &one, invA + i * BLOCK, + 0, BLOCK, + 0, B + i * incx, + 0, incx, + 0, &zero, X + i, + 0, + 1, + 0, 1); if(i + BLOCK < m) rocblas_gemv_template(handle, @@ -161,12 +175,19 @@ namespace BLOCK, &negative_one, A + i + BLOCK + i * lda, + 0, lda, + 0, X + i, + 0, 1, + 0, &one, B + (i + BLOCK) * incx, - incx); + 0, + incx, + 0, + 1); } } } @@ -183,11 +204,18 @@ namespace jb, &one, invA + i * BLOCK, + 0, BLOCK, + 0, B + i * incx, + 0, incx, + 0, &zero, X + i, + 0, + 1, + 0, 1); if(i >= BLOCK) @@ -198,12 +226,19 @@ namespace jb, &negative_one, A + i * lda, + 0, lda, + 0, X + i, + 0, 1, + 0, &one, B, - incx); + 0, + incx, + 0, + 1); // remaining blocks for(i = m - jb - BLOCK; i >= 0; i -= BLOCK) @@ -215,11 +250,18 @@ namespace BLOCK, &one, invA + i * BLOCK, + 0, BLOCK, + 0, B + i * incx, + 0, incx, + 0, &zero, X + i, + 0, + 1, + 0, 1); if(i >= BLOCK) @@ -229,12 +271,19 @@ namespace BLOCK, &negative_one, A + i * lda, + 0, lda, + 0, X + i, + 0, 1, + 0, &one, B, - incx); + 0, + incx, + 0, + 1); } } } @@ -253,11 +302,18 @@ namespace jb, &one, invA + i * BLOCK, + 0, BLOCK, + 0, B + i * incx, + 0, incx, + 0, &zero, X + i, + 0, + 1, + 0, 1); if(i - BLOCK >= 0) @@ -268,12 +324,19 @@ namespace i, &negative_one, A + i, + 0, lda, + 0, X + i, + 0, 1, + 0, &one, B, - incx); + 0, + incx, + 0, + 1); // remaining blocks for(i = m - jb - BLOCK; i >= 0; i -= BLOCK) @@ -284,11 +347,18 @@ namespace BLOCK, &one, invA + i * BLOCK, + 0, BLOCK, + 0, B + i * incx, + 0, incx, + 0, &zero, X + i, + 0, + 1, + 0, 1); if(i >= BLOCK) @@ -298,12 +368,19 @@ namespace i, &negative_one, A + i, + 0, lda, + 0, X + i, + 0, 1, + 0, &one, B, - incx); + 0, + incx, + 0, + 1); } } } @@ -311,8 +388,25 @@ namespace { // left, upper transpose jb = min(BLOCK, m); - rocblas_gemv_template( - handle, transA, jb, jb, &one, invA, BLOCK, B, incx, &zero, X, 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA, + 0, + BLOCK, + 0, + B, + 0, + incx, + 0, + &zero, + X, + 0, + 1, + 0, + 1); if(BLOCK < m) { @@ -322,12 +416,19 @@ namespace m - BLOCK, &negative_one, A + BLOCK * lda, + 0, lda, + 0, X, + 0, 1, + 0, &one, B + BLOCK * incx, - incx); + 0, + incx, + 0, + 1); // remaining blocks for(i = BLOCK; i < m; i += BLOCK) @@ -339,11 +440,18 @@ namespace jb, &one, invA + i * BLOCK, + 0, BLOCK, + 0, B + i * incx, + 0, incx, + 0, &zero, X + i, + 0, + 1, + 0, 1); if(i + BLOCK < m) @@ -353,12 +461,19 @@ namespace m - i - BLOCK, &negative_one, A + i + (i + BLOCK) * lda, + 0, lda, + 0, X + i, + 0, 1, + 0, &one, B + (i + BLOCK) * incx, - incx); + 0, + incx, + 0, + 1); } } } @@ -415,11 +530,18 @@ namespace N, &negative_one, A_current, + 0, lda, + 0, B_current, + 0, incx, + 0, &one, x_temp, + 0, + 1, + 0, 1); } @@ -429,12 +551,19 @@ namespace BLOCK, &one, invA + j * BLOCK * BLOCK, + 0, BLOCK, + 0, x_temp, + 0, 1, + 0, &zero, B + j * BLOCK * incx, - incx); + 0, + incx, + 0, + 1); } return rocblas_status_success; From 5797bdaebe63d8e02e6f0a1fc7aca440fb19894c Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Fri, 4 Oct 2019 14:33:18 -0600 Subject: [PATCH 11/13] template fix --- library/src/blas2/rocblas_trsv.cpp | 648 ++++++++++++++--------------- 1 file changed, 324 insertions(+), 324 deletions(-) diff --git a/library/src/blas2/rocblas_trsv.cpp b/library/src/blas2/rocblas_trsv.cpp index b324a7e86..690fa6b0d 100644 --- a/library/src/blas2/rocblas_trsv.cpp +++ b/library/src/blas2/rocblas_trsv.cpp @@ -119,75 +119,75 @@ namespace { // left, lower no-transpose jb = min(BLOCK, m); - rocblas_gemv_template( + rocblas_gemv_template( handle, transA, jb, jb, &one, invA, BLOCK, B, incx, &zero, X, 1); if(BLOCK < m) { - rocblas_gemv_template(handle, - transA, - m - BLOCK, - BLOCK, - &negative_one, - A + BLOCK, - 0, - lda, - 0, - X, - 0, - 1, - 0, - &one, - B + BLOCK * incx, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + m - BLOCK, + BLOCK, + &negative_one, + A + BLOCK, + 0, + lda, + 0, + X, + 0, + 1, + 0, + &one, + B + BLOCK * incx, + 0, + incx, + 0, + 1); // remaining blocks for(i = BLOCK; i < m; i += BLOCK) { jb = min(m - i, BLOCK); - rocblas_gemv_template(handle, - transA, - jb, - jb, - &one, - invA + i * BLOCK, - 0, - BLOCK, - 0, - B + i * incx, - 0, - incx, - 0, - &zero, - X + i, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA + i * BLOCK, + 0, + BLOCK, + 0, + B + i * incx, + 0, + incx, + 0, + &zero, + X + i, + 0, + 1, + 0, + 1); if(i + BLOCK < m) - rocblas_gemv_template(handle, - transA, - m - i - BLOCK, - BLOCK, - &negative_one, - A + i + BLOCK + i * lda, - 0, - lda, - 0, - X + i, - 0, - 1, - 0, - &one, - B + (i + BLOCK) * incx, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + m - i - BLOCK, + BLOCK, + &negative_one, + A + i + BLOCK + i * lda, + 0, + lda, + 0, + X + i, + 0, + 1, + 0, + &one, + B + (i + BLOCK) * incx, + 0, + incx, + 0, + 1); } } } @@ -198,92 +198,92 @@ namespace i = m - jb; // if m=n=35=lda=ldb, BLOCK =32, then jb = 3, i = 32; {3, 35, 3, 32, 35, 35} - rocblas_gemv_template(handle, - transA, - jb, - jb, - &one, - invA + i * BLOCK, - 0, - BLOCK, - 0, - B + i * incx, - 0, - incx, - 0, - &zero, - X + i, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA + i * BLOCK, + 0, + BLOCK, + 0, + B + i * incx, + 0, + incx, + 0, + &zero, + X + i, + 0, + 1, + 0, + 1); if(i >= BLOCK) { - rocblas_gemv_template(handle, - transA, - i, - jb, - &negative_one, - A + i * lda, - 0, - lda, - 0, - X + i, - 0, - 1, - 0, - &one, - B, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + i, + jb, + &negative_one, + A + i * lda, + 0, + lda, + 0, + X + i, + 0, + 1, + 0, + &one, + B, + 0, + incx, + 0, + 1); // remaining blocks for(i = m - jb - BLOCK; i >= 0; i -= BLOCK) { //{32, 35, 32, 32, 35, 35} - rocblas_gemv_template(handle, - transA, - BLOCK, - BLOCK, - &one, - invA + i * BLOCK, - 0, - BLOCK, - 0, - B + i * incx, - 0, - incx, - 0, - &zero, - X + i, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + BLOCK, + BLOCK, + &one, + invA + i * BLOCK, + 0, + BLOCK, + 0, + B + i * incx, + 0, + incx, + 0, + &zero, + X + i, + 0, + 1, + 0, + 1); if(i >= BLOCK) - rocblas_gemv_template(handle, - transA, - i, - BLOCK, - &negative_one, - A + i * lda, - 0, - lda, - 0, - X + i, - 0, - 1, - 0, - &one, - B, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + i, + BLOCK, + &negative_one, + A + i * lda, + 0, + lda, + 0, + X + i, + 0, + 1, + 0, + &one, + B, + 0, + incx, + 0, + 1); } } } @@ -296,91 +296,91 @@ namespace jb = m % BLOCK == 0 ? BLOCK : m % BLOCK; i = m - jb; - rocblas_gemv_template(handle, - transA, - jb, - jb, - &one, - invA + i * BLOCK, - 0, - BLOCK, - 0, - B + i * incx, - 0, - incx, - 0, - &zero, - X + i, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA + i * BLOCK, + 0, + BLOCK, + 0, + B + i * incx, + 0, + incx, + 0, + &zero, + X + i, + 0, + 1, + 0, + 1); if(i - BLOCK >= 0) { - rocblas_gemv_template(handle, - transA, - jb, - i, - &negative_one, - A + i, - 0, - lda, - 0, - X + i, - 0, - 1, - 0, - &one, - B, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + jb, + i, + &negative_one, + A + i, + 0, + lda, + 0, + X + i, + 0, + 1, + 0, + &one, + B, + 0, + incx, + 0, + 1); // remaining blocks for(i = m - jb - BLOCK; i >= 0; i -= BLOCK) { - rocblas_gemv_template(handle, - transA, - BLOCK, - BLOCK, - &one, - invA + i * BLOCK, - 0, - BLOCK, - 0, - B + i * incx, - 0, - incx, - 0, - &zero, - X + i, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + BLOCK, + BLOCK, + &one, + invA + i * BLOCK, + 0, + BLOCK, + 0, + B + i * incx, + 0, + incx, + 0, + &zero, + X + i, + 0, + 1, + 0, + 1); if(i >= BLOCK) - rocblas_gemv_template(handle, - transA, - BLOCK, - i, - &negative_one, - A + i, - 0, - lda, - 0, - X + i, - 0, - 1, - 0, - &one, - B, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + BLOCK, + i, + &negative_one, + A + i, + 0, + lda, + 0, + X + i, + 0, + 1, + 0, + &one, + B, + 0, + incx, + 0, + 1); } } } @@ -388,92 +388,92 @@ namespace { // left, upper transpose jb = min(BLOCK, m); - rocblas_gemv_template(handle, - transA, - jb, - jb, - &one, - invA, - 0, - BLOCK, - 0, - B, - 0, - incx, - 0, - &zero, - X, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA, + 0, + BLOCK, + 0, + B, + 0, + incx, + 0, + &zero, + X, + 0, + 1, + 0, + 1); if(BLOCK < m) { - rocblas_gemv_template(handle, - transA, - BLOCK, - m - BLOCK, - &negative_one, - A + BLOCK * lda, - 0, - lda, - 0, - X, - 0, - 1, - 0, - &one, - B + BLOCK * incx, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + BLOCK, + m - BLOCK, + &negative_one, + A + BLOCK * lda, + 0, + lda, + 0, + X, + 0, + 1, + 0, + &one, + B + BLOCK * incx, + 0, + incx, + 0, + 1); // remaining blocks for(i = BLOCK; i < m; i += BLOCK) { jb = min(m - i, BLOCK); - rocblas_gemv_template(handle, - transA, - jb, - jb, - &one, - invA + i * BLOCK, - 0, - BLOCK, - 0, - B + i * incx, - 0, - incx, - 0, - &zero, - X + i, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA + i * BLOCK, + 0, + BLOCK, + 0, + B + i * incx, + 0, + incx, + 0, + &zero, + X + i, + 0, + 1, + 0, + 1); if(i + BLOCK < m) - rocblas_gemv_template(handle, - transA, - BLOCK, - m - i - BLOCK, - &negative_one, - A + i + (i + BLOCK) * lda, - 0, - lda, - 0, - X + i, - 0, - 1, - 0, - &one, - B + (i + BLOCK) * incx, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + BLOCK, + m - i - BLOCK, + &negative_one, + A + i + (i + BLOCK) * lda, + 0, + lda, + 0, + X + i, + 0, + 1, + 0, + &one, + B + (i + BLOCK) * incx, + 0, + incx, + 0, + 1); } } } @@ -524,46 +524,46 @@ namespace A_current = parity ? A + BLOCK * ((lda + 1) * q - lda) : A + M * lda; } - rocblas_gemv_template(handle, - transA, - M, - N, - &negative_one, - A_current, - 0, - lda, - 0, - B_current, - 0, - incx, - 0, - &one, - x_temp, - 0, - 1, - 0, - 1); + rocblas_gemv_template(handle, + transA, + M, + N, + &negative_one, + A_current, + 0, + lda, + 0, + B_current, + 0, + incx, + 0, + &one, + x_temp, + 0, + 1, + 0, + 1); } - rocblas_gemv_template(handle, - transA, - BLOCK, - BLOCK, - &one, - invA + j * BLOCK * BLOCK, - 0, - BLOCK, - 0, - x_temp, - 0, - 1, - 0, - &zero, - B + j * BLOCK * incx, - 0, - incx, - 0, - 1); + rocblas_gemv_template(handle, + transA, + BLOCK, + BLOCK, + &one, + invA + j * BLOCK * BLOCK, + 0, + BLOCK, + 0, + x_temp, + 0, + 1, + 0, + &zero, + B + j * BLOCK * incx, + 0, + incx, + 0, + 1); } return rocblas_status_success; From 102e9d9057dda3b3ca118d3eb2657024edfc885f Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Fri, 4 Oct 2019 15:23:50 -0600 Subject: [PATCH 12/13] another fix --- library/src/blas2/rocblas_trsv.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/library/src/blas2/rocblas_trsv.cpp b/library/src/blas2/rocblas_trsv.cpp index 690fa6b0d..19e4ecf0d 100644 --- a/library/src/blas2/rocblas_trsv.cpp +++ b/library/src/blas2/rocblas_trsv.cpp @@ -119,8 +119,25 @@ namespace { // left, lower no-transpose jb = min(BLOCK, m); - rocblas_gemv_template( - handle, transA, jb, jb, &one, invA, BLOCK, B, incx, &zero, X, 1); + rocblas_gemv_template(handle, + transA, + jb, + jb, + &one, + invA, + 0, + BLOCK, + 0, + B, + 0, + incx, + 0, + &zero, + X, + 0, + 1, + 0, + 1); if(BLOCK < m) { From e595eee4621b6a115ce84759cf57d21eb2bb9b44 Mon Sep 17 00:00:00 2001 From: mahmoodw Date: Mon, 7 Oct 2019 14:17:02 -0600 Subject: [PATCH 13/13] naming --- library/src/blas2/gemv_device.hpp | 86 +++++++++++++++---------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/library/src/blas2/gemv_device.hpp b/library/src/blas2/gemv_device.hpp index 2800b2033..efa160fbf 100644 --- a/library/src/blas2/gemv_device.hpp +++ b/library/src/blas2/gemv_device.hpp @@ -12,16 +12,16 @@ template {}, int>::type = 0> -__device__ void gemvn_kernel_worker(rocblas_int m, - rocblas_int n, - U alpha, - const T* A, - rocblas_int lda, - const T* x, - rocblas_int incx, - U beta, - T* y, - rocblas_int incy) +__device__ void gemvn_kernel_calc(rocblas_int m, + rocblas_int n, + U alpha, + const T* A, + rocblas_int lda, + const T* x, + rocblas_int incx, + U beta, + T* y, + rocblas_int incy) { rocblas_int thread_id = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x; @@ -172,16 +172,16 @@ __device__ void gemvn_kernel_worker(rocblas_int m, // Overload for double precision complex numbers. We run out of registers // if we use the above algorithm. template -__device__ void gemvn_kernel_worker(rocblas_int m, - rocblas_int n, - U alpha, - const rocblas_double_complex* A, - rocblas_int lda, - const rocblas_double_complex* x, - rocblas_int incx, - U beta, - rocblas_double_complex* y, - rocblas_int incy) +__device__ void gemvn_kernel_calc(rocblas_int m, + rocblas_int n, + U alpha, + const rocblas_double_complex* A, + rocblas_int lda, + const rocblas_double_complex* x, + rocblas_int incx, + U beta, + rocblas_double_complex* y, + rocblas_int incy) { rocblas_int thread_id = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x; @@ -250,16 +250,16 @@ __device__ void gemvn_kernel_worker(rocblas_int m, } template -__device__ void gemvc_kernel_worker(rocblas_int m, - rocblas_int n, - U alpha, - const T* A, - rocblas_int lda, - const T* x, - rocblas_int incx, - U beta, - T* y, - rocblas_int incy) +__device__ void gemvc_kernel_calc(rocblas_int m, + rocblas_int n, + U alpha, + const T* A, + rocblas_int lda, + const T* x, + rocblas_int incx, + U beta, + T* y, + rocblas_int incy) { rocblas_int tx = hipThreadIdx_x; @@ -317,16 +317,16 @@ __device__ void gemvc_kernel_worker(rocblas_int m, } template -__device__ void gemvt_kernel_worker(rocblas_int m, - rocblas_int n, - U alpha, - const T* A, - rocblas_int lda, - const T* x, - rocblas_int incx, - U beta, - T* y, - rocblas_int incy) +__device__ void gemvt_kernel_calc(rocblas_int m, + rocblas_int n, + U alpha, + const T* A, + rocblas_int lda, + const T* x, + rocblas_int incx, + U beta, + T* y, + rocblas_int incy) { rocblas_int tx = hipThreadIdx_x; @@ -414,7 +414,7 @@ __global__ void gemvn_kernel(rocblas_int m, auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); - gemvn_kernel_worker(m, n, alpha, A, lda, x, incx, beta, y, incy); + gemvn_kernel_calc(m, n, alpha, A, lda, x, incx, beta, y, incy); } template @@ -444,7 +444,7 @@ __global__ void gemvc_kernel(rocblas_int m, auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); - gemvc_kernel_worker(m, n, alpha, A, lda, x, incx, beta, y, incy); + gemvc_kernel_calc(m, n, alpha, A, lda, x, incx, beta, y, incy); } template @@ -474,7 +474,7 @@ __global__ void gemvt_kernel(rocblas_int m, auto alpha = load_scalar(alpha_device_host, hipBlockIdx_y, stride_alpha); auto beta = load_scalar(beta_device_host, hipBlockIdx_y, stride_beta); - gemvt_kernel_worker(m, n, alpha, A, lda, x, incx, beta, y, incy); + gemvt_kernel_calc(m, n, alpha, A, lda, x, incx, beta, y, incy); } #endif