From 598a9e9cfd76fede4f1899e7ae927e8cee81ebab Mon Sep 17 00:00:00 2001
From: jzuniga-amd <juan.zuniga-anaya@amd.com>
Date: Fri, 9 Aug 2019 12:22:27 -0600
Subject: [PATCH 1/5] Enable unit tests for gemv_batched and
 gemv_strided_batched

- Add function templates for gemv_strided_batched and gemv_batched (in rocblas_gemv.hpp)
  to enable correct calls of these functions from other functions or from outside rocblas.
- Add batch and strides checks and quick return in rocblas_gemv_batched.cpp and rocblas_gemv_strided_batched.cpp
- Add unit tests testing_gemv_batched.hpp and testing_gemv_strided_batched.hpp
- Add new class device_batch_vector in rocblas_vector.hpp. Needed for the batched case.
- Add new template headers to rocblas.hpp
- Add new template header and specializations for norm_check_general to work with the batched case (in norm.hpp and norm.cpp)
- Add new template and specializations for unit_check_general to work with the batched case (in unit.hpp)
- Add new arguments, stride_x and stride_y (needed to test gemv_strided_batched) in rocblas_arguments.hpp and
  rocblas_common.yaml. Set stride_x and stride_y defaults to zero in rocblas_common.yaml to correctly generate the tests of
  those functions that do not need these arguments
- Include the new tests in client.cpp as well as a description of the new arguments
- Add the new functions in rocblas_template.yaml to process YAML from log files
- Add batched and strided_batched template test cases in gemv_gtest.cpp
- Add new yaml test-data files gemv_batched_gtest.yaml and gemv_strided_batched_gtest.yaml
- Include the new yaml files in rocblas_gtest.yaml
- Add the new yaml files to the list of dependencies for rocblas_gtest.data in CMakeLists.txt
---
 clients/benchmarks/client.cpp                 |  16 +
 clients/common/norm.cpp                       | 274 ++++++++++-
 clients/gtest/CMakeLists.txt                  |   2 +-
 clients/gtest/gemv_batched_gtest.yaml         |  91 ++++
 clients/gtest/gemv_gtest.cpp                  | 111 ++++-
 clients/gtest/gemv_strided_batched_gtest.yaml |  91 ++++
 clients/gtest/rocblas_gtest.yaml              |   2 +
 clients/include/norm.hpp                      |  13 +-
 clients/include/rocblas.hpp                   |  59 +++
 clients/include/rocblas_arguments.hpp         |   7 +
 clients/include/rocblas_common.yaml           |   4 +
 clients/include/rocblas_template.yaml         |   4 +
 clients/include/rocblas_vector.hpp            | 158 ++++---
 clients/include/testing_gemv.hpp              |   4 +-
 clients/include/testing_gemv_batched.hpp      | 360 ++++++++++++++
 .../include/testing_gemv_strided_batched.hpp  | 442 ++++++++++++++++++
 clients/include/unit.hpp                      |  89 ++++
 library/src/blas2/gemv_device.hpp             |   1 -
 library/src/blas2/rocblas_gemv.cpp            |   1 -
 library/src/blas2/rocblas_gemv.hpp            | 365 +++++++++++++++
 library/src/blas2/rocblas_gemv_batched.cpp    | 173 +------
 .../blas2/rocblas_gemv_strided_batched.cpp    | 218 ++-------
 22 files changed, 2055 insertions(+), 430 deletions(-)
 create mode 100644 clients/gtest/gemv_batched_gtest.yaml
 create mode 100644 clients/gtest/gemv_strided_batched_gtest.yaml
 create mode 100644 clients/include/testing_gemv_batched.hpp
 create mode 100644 clients/include/testing_gemv_strided_batched.hpp
diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp
index 4ff1679e5..6181075fa 100644
--- a/clients/benchmarks/client.cpp
+++ b/clients/benchmarks/client.cpp
@@ -13,6 +13,8 @@
 #include "testing_dot.hpp"
 #include "testing_geam.hpp"
 #include "testing_gemv.hpp"
+#include "testing_gemv_batched.hpp"
+#include "testing_gemv_strided_batched.hpp"
 #include "testing_ger.hpp"
 #include "testing_iamax_iamin.hpp"
 #include "testing_nrm2.hpp"
@@ -141,6 +143,10 @@ struct perf_blas<
             testing_nrm2<T>(arg);
         else if(!strcmp(arg.function, "gemv"))
             testing_gemv<T>(arg);
+        else if(!strcmp(arg.function, "gemv_batched"))
+            testing_gemv_batched<T>(arg);
+        else if(!strcmp(arg.function, "gemv_strided_batched"))
+            testing_gemv_strided_batched<T>(arg);
         else if(!strcmp(arg.function, "ger"))
             testing_ger<T>(arg);
         else if(!strcmp(arg.function, "syr"))
@@ -500,6 +506,16 @@ try
          "Specific stride of strided_batched matrix D, is only applicable to strided batched"
          "BLAS_EX: second dimension * leading dimension.")
 
+        ("stride_x", // help text is two concatenated literals; the first must end with a space
+         value<rocblas_int>(&arg.stride_x)->default_value(128*128),
+         "Specific stride of strided_batched vector x, is only applicable to strided batched "
+         "BLAS_2: second dimension.")
+
+        ("stride_y", // help text is two concatenated literals; the first must end with a space
+         value<rocblas_int>(&arg.stride_y)->default_value(128*128),
+         "Specific stride of strided_batched vector y, is only applicable to strided batched "
+         "BLAS_2: leading dimension.")
+
         ("incx",
          value<rocblas_int>(&arg.incx)->default_value(1),
          "increment between values in x vector")
diff --git a/clients/common/norm.cpp b/clients/common/norm.cpp
index b350e6c2d..e7760d42a 100644
--- a/clients/common/norm.cpp
+++ b/clients/common/norm.cpp
@@ -355,7 +355,6 @@ double norm_check_general<rocblas_half>(char          norm_type,
     return cumulative_error;
 }
 
-//=====Norm Check for strided_batched matrix
 template <>
 double norm_check_general(char         norm_type,
                           rocblas_int  M,
@@ -506,6 +505,279 @@ double norm_check_general<double>(char        norm_type,
     return cumulative_error;
 }
 
+template <>
+double norm_check_general<rocblas_float_complex>(char        norm_type,
+                                  rocblas_int M,
+                                  rocblas_int N,
+                                  rocblas_int lda,
+                                  rocblas_int stride_a,
+                                  rocblas_int batch_count,
+                                  rocblas_float_complex*     hCPU,
+                                  rocblas_float_complex*     hGPU)
+{
+    // Relative error norm(hGPU - hCPU) / norm(hCPU) per batch (batch i at offset i * stride_a).
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // Frobenius errors are summed (triangle-inequality upper bound); one/infinity errors take
+    // the max. hGPU is overwritten with the difference by the axpy call.
+    // NOTE(review): LAPACK ?lange needs M work entries for the 'I' norm but `work` is a single
+    // scalar, and caxpy_ conventionally reads a complex alpha - confirm against the prototypes.
+
+    float      work;
+    rocblas_int incx  = 1;
+    float      alpha = -1.0f;
+    rocblas_int size  = lda * N;
+
+    double cumulative_error = 0.0;
+
+    for(int i = 0; i < batch_count; i++)
+    {
+        float cpu_norm = clange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work);
+
+        caxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx);
+
+        float error = clange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm;
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error;
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error;
+        }
+    }
+
+    return cumulative_error;
+}
+
+template <>
+double norm_check_general<rocblas_double_complex>(char        norm_type,
+                                  rocblas_int M,
+                                  rocblas_int N,
+                                  rocblas_int lda,
+                                  rocblas_int stride_a,
+                                  rocblas_int batch_count,
+                                  rocblas_double_complex*     hCPU,
+                                  rocblas_double_complex*     hGPU)
+{
+    // Relative error norm(hGPU - hCPU) / norm(hCPU) per batch (batch i at offset i * stride_a).
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // Frobenius errors are summed (triangle-inequality upper bound); one/infinity errors take
+    // the max. hGPU is overwritten with the difference by the axpy call.
+    // NOTE(review): LAPACK ?lange needs M work entries for the 'I' norm but `work` is a single
+    // scalar, and zaxpy_ conventionally reads a complex alpha - confirm against the prototypes.
+
+    double      work;
+    rocblas_int incx  = 1;
+    double      alpha = -1.0f;
+    rocblas_int size  = lda * N;
+
+    double cumulative_error = 0.0;
+
+    for(int i = 0; i < batch_count; i++)
+    {
+        double cpu_norm = zlange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work);
+
+        zaxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx);
+
+        double error = zlange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm;
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error;
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error;
+        }
+    }
+
+    return cumulative_error;
+}
+
+//=====Norm Check for batched matrix (one host_vector per batch)
+template <>
+double norm_check_general<float>(char               norm_type,
+                                 rocblas_int        M,
+                                 rocblas_int        N,
+                                 rocblas_int        lda,
+                                 rocblas_int        batch_count,
+                                 host_vector<float> hCPU[],
+                                 host_vector<float> hGPU[])
+{
+    // Relative error norm(hGPU[i] - hCPU[i]) / norm(hCPU[i]) per batch i, reduced over batches.
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // Frobenius errors are summed (triangle-inequality upper bound); one/infinity errors take
+    // the max. Each hGPU[i] is overwritten with the difference by the axpy call.
+    // NOTE(review): LAPACK ?lange needs M work entries for the 'I' norm, but `work` below is a
+    // single scalar - confirm no caller requests the infinity norm with M > 1.
+
+    float       work;
+    rocblas_int incx  = 1;
+    float       alpha = -1.0f;
+    rocblas_int size  = lda * N;
+
+    double cumulative_error = 0.0;
+
+    for(int i = 0; i < batch_count; i++)
+    {
+        float cpu_norm = slange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
+
+        saxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
+
+        float error = slange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error;
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error;
+        }
+    }
+
+    return cumulative_error;
+}
+
+template <>
+double norm_check_general<double>(char                norm_type,
+                                  rocblas_int         M,
+                                  rocblas_int         N,
+                                  rocblas_int         lda,
+                                  rocblas_int         batch_count,
+                                  host_vector<double> hCPU[],
+                                  host_vector<double> hGPU[])
+{
+    // Relative error norm(hGPU[i] - hCPU[i]) / norm(hCPU[i]) per batch i, reduced over batches.
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // Frobenius errors are summed (triangle-inequality upper bound); one/infinity errors take
+    // the max. Each hGPU[i] is overwritten with the difference by the axpy call.
+    // NOTE(review): LAPACK ?lange needs M work entries for the 'I' norm, but `work` below is a
+    // single scalar - confirm no caller requests the infinity norm with M > 1.
+
+    double      work;
+    rocblas_int incx  = 1;
+    double      alpha = -1.0f;
+    rocblas_int size  = lda * N;
+
+    double cumulative_error = 0.0;
+
+    for(int i = 0; i < batch_count; i++)
+    {
+        double cpu_norm = dlange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
+
+        daxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
+
+        double error = dlange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error;
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error;
+        }
+    }
+
+    return cumulative_error;
+}
+
+template <>
+double norm_check_general<rocblas_float_complex>(char               norm_type,
+                                 rocblas_int        M,
+                                 rocblas_int        N,
+                                 rocblas_int        lda,
+                                 rocblas_int        batch_count,
+                                 host_vector<rocblas_float_complex> hCPU[],
+                                 host_vector<rocblas_float_complex> hGPU[])
+{
+    // Relative error norm(hGPU[i] - hCPU[i]) / norm(hCPU[i]) per batch i, reduced over batches.
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // Frobenius errors are summed (triangle-inequality upper bound); one/infinity errors take
+    // the max. Each hGPU[i] is overwritten with the difference by the axpy call.
+    // NOTE(review): LAPACK ?lange needs M work entries for the 'I' norm but `work` is a single
+    // scalar, and caxpy_ conventionally reads a complex alpha - confirm against the prototypes.
+
+    float       work;
+    rocblas_int incx  = 1;
+    float       alpha = -1.0f;
+    rocblas_int size  = lda * N;
+
+    double cumulative_error = 0.0;
+
+    for(int i = 0; i < batch_count; i++)
+    {
+        float cpu_norm = clange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
+
+        caxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
+
+        float error = clange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error;
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error;
+        }
+    }
+
+    return cumulative_error;
+}
+
+template <>
+double norm_check_general<rocblas_double_complex>(char                norm_type,
+                                  rocblas_int         M,
+                                  rocblas_int         N,
+                                  rocblas_int         lda,
+                                  rocblas_int         batch_count,
+                                  host_vector<rocblas_double_complex> hCPU[],
+                                  host_vector<rocblas_double_complex> hGPU[])
+{
+    // Relative error norm(hGPU[i] - hCPU[i]) / norm(hCPU[i]) per batch i, reduced over batches.
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // Frobenius errors are summed (triangle-inequality upper bound); one/infinity errors take
+    // the max. Each hGPU[i] is overwritten with the difference by the axpy call.
+    // NOTE(review): LAPACK ?lange needs M work entries for the 'I' norm but `work` is a single
+    // scalar, and zaxpy_ conventionally reads a complex alpha - confirm against the prototypes.
+
+    double      work;
+    rocblas_int incx  = 1;
+    double      alpha = -1.0f;
+    rocblas_int size  = lda * N;
+
+    double cumulative_error = 0.0;
+
+    for(int i = 0; i < batch_count; i++)
+    {
+        double cpu_norm = zlange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
+
+        zaxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
+
+        double error = zlange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error;
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error;
+        }
+    }
+
+    return cumulative_error;
+}
+
 /* ============================Norm Check for Symmetric Matrix: float/double/complex template
  * speciliazation ======================================= */
 
diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt
index a6c602bda..6f073afe1 100644
--- a/clients/gtest/CMakeLists.txt
+++ b/clients/gtest/CMakeLists.txt
@@ -113,7 +113,7 @@ endif( )
 set( ROCBLAS_TEST_DATA "${PROJECT_BINARY_DIR}/staging/rocblas_gtest.data")
 add_custom_command( OUTPUT "${ROCBLAS_TEST_DATA}"
                     COMMAND ../common/rocblas_gentest.py -I ../include rocblas_gtest.yaml -o "${ROCBLAS_TEST_DATA}"
-                    DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml
+                    DEPENDS ../common/rocblas_gentest.py rocblas_gtest.yaml ../include/rocblas_common.yaml known_bugs.yaml blas1_gtest.yaml gemm_gtest.yaml gemm_strided_batched_gtest.yaml gemv_gtest.yaml gemv_batched_gtest.yaml gemv_strided_batched_gtest.yaml symv_gtest.yaml syr_gtest.yaml ger_gtest.yaml trsm_gtest.yaml trtri_gtest.yaml geam_gtest.yaml set_get_vector_gtest.yaml set_get_matrix_gtest.yaml trsv_gtest.yaml logging_mode_gtest.yaml set_get_pointer_mode_gtest.yaml
                     WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" )
 add_custom_target( rocblas-test-data
                    DEPENDS "${ROCBLAS_TEST_DATA}" )
diff --git a/clients/gtest/gemv_batched_gtest.yaml b/clients/gtest/gemv_batched_gtest.yaml
new file mode 100644
index 000000000..e0eeedd3d
--- /dev/null
+++ b/clients/gtest/gemv_batched_gtest.yaml
@@ -0,0 +1,91 @@
+---
+include: rocblas_common.yaml
+include: known_bugs.yaml
+
+Definitions:
+  - &small_matrix_size_range
+    - { M:    -1, N:     1, lda:    1 }
+    - { M:     1, N:    -1, lda:    1 }
+    - { M:     1, N:     1, lda:    0 }
+    - { M:    10, N:    10, lda:    9 }
+    - { M:     0, N:     1, lda:    1 }
+    - { M:     1, N:     0, lda:    1 }
+    - { M:    -1, N:    -1, lda:   -1 }
+    - { M:    10, N:    10, lda:    2 }
+    - { M:   100, N:   200, lda:  200 }
+
+  - &medium_matrix_size_range
+    - { M:   300, N:   400, lda:  400 }
+    - { M:   600, N:   500, lda:  601 }
+
+  - &large_matrix_size_range
+    - { M:  1000, N:  1000, lda: 1000 }
+    - { M:  2000, N:  2000, lda: 2000 }
+    - { M:  4011, N:  4011, lda: 4011 }
+    - { M:  8000, N:  8000, lda: 8000 }
+
+  - &incx_incy_range
+    - { incx:   2, incy:   1 }
+    - { incx:  -1, incy:   2 }
+    - { incx:   1, incy:   1 }
+    - { incx:  -1, incy:   3 }
+    - { incx:   3, incy:  -1 }
+    - { incx:   0, incy:   1 }
+    - { incx:   1, incy:   0 }
+    - { incx:   0, incy:  -1 }
+    - { incx:  10, incy: 100 }
+
+  - &alpha_beta_range
+    - { alpha:  2.0, beta:  0.0 }
+    - { alpha: -1.0, beta: -1.0 }
+    - { alpha:  2.0, beta:  1.0 }
+    - { alpha:  0.0, beta:  1.0 }
+
+Tests:
+- name: gemv_batched_bad_arg
+  category: pre_checkin
+  function: gemv_batched_bad_arg
+  precision: *single_double_precisions
+  transA: N
+
+- name: gemv_batched_NaN
+  category: pre_checkin
+  function: gemv_batched
+  precision: *single_double_precisions
+  transA: [ N, T, C ]
+  matrix_size: *medium_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha: [ -1.0, 0, 1.0, 2.0 ]
+  beta: .NaN  # converted to 0.0 in test code
+  batch_count: [ -1, 0, 1, 3 ]
+
+- name: gemv_batched_small
+  category: quick
+  function: gemv_batched
+  precision: *single_double_precisions
+  transA: [ N, T, C ]
+  matrix_size: *small_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha_beta: *alpha_beta_range
+  batch_count: [ -1, 0, 1, 3 ]
+
+- name: gemv_batched_medium
+  category: pre_checkin
+  function: gemv_batched
+  precision: *single_double_precisions_complex_real
+  transA: [ N, T, C ]
+  matrix_size: *medium_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha_beta: *alpha_beta_range
+  batch_count: [ 3 ]
+
+- name: gemv_batched_large
+  category: nightly
+  function: gemv_batched
+  precision: *single_double_precisions
+  transA: [ N, T, C ]
+  matrix_size: *large_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha_beta: *alpha_beta_range
+  batch_count: [ 3 ]
+...
diff --git a/clients/gtest/gemv_gtest.cpp b/clients/gtest/gemv_gtest.cpp
index ff86b426d..fd9823333 100644
--- a/clients/gtest/gemv_gtest.cpp
+++ b/clients/gtest/gemv_gtest.cpp
@@ -6,6 +6,8 @@
 #include "rocblas_datatype2string.hpp"
 #include "rocblas_test.hpp"
 #include "testing_gemv.hpp"
+#include "testing_gemv_batched.hpp"
+#include "testing_gemv_strided_batched.hpp"
 #include "type_dispatch.hpp"
 #include <cctype>
 #include <cstring>
@@ -13,6 +15,69 @@
 
 namespace
 {
+    // Selects which gemv variant a gemv_template instantiation covers (see function_filter)
+    enum gemv_test_type
+    {
+        GEMV, // functions "gemv" and "gemv_bad_arg"
+        GEMV_BATCHED, // functions "gemv_batched" and "gemv_batched_bad_arg"
+        GEMV_STRIDED_BATCHED, // functions "gemv_strided_batched" and "gemv_strided_batched_bad_arg"
+    };
+
+    // Test suite shared by gemv, gemv_batched and gemv_strided_batched; GEMV_TYPE picks the variant
+    template <template <typename...> class FILTER, gemv_test_type GEMV_TYPE>
+    struct gemv_template : RocBLAS_Test<gemv_template<FILTER, GEMV_TYPE>, FILTER>
+    {
+        // Filter for which types apply to this suite
+        static bool type_filter(const Arguments& arg)
+        {
+            return rocblas_simple_dispatch<gemv_template::template type_filter_functor>(arg);
+        }
+
+        // Filter for which functions apply to this suite (matched on arg.function per GEMV_TYPE)
+        static bool function_filter(const Arguments& arg)
+        {
+            switch(GEMV_TYPE)
+            {
+            case GEMV:
+                return !strcmp(arg.function, "gemv") || !strcmp(arg.function, "gemv_bad_arg");
+            case GEMV_BATCHED:
+                return !strcmp(arg.function, "gemv_batched")
+                       || !strcmp(arg.function, "gemv_batched_bad_arg");
+            case GEMV_STRIDED_BATCHED:
+                return !strcmp(arg.function, "gemv_strided_batched")
+                       || !strcmp(arg.function, "gemv_strided_batched_bad_arg");
+            }
+            return false;
+        }
+
+        // Google Test name suffix: type, transA, M, N, alpha/alphai, lda, incx, beta/betai, incy
+        static std::string name_suffix(const Arguments& arg)
+        {
+            RocBLAS_TestName<gemv_template> name;
+
+            name << rocblas_datatype2string(arg.a_type) << '_' << (char)std::toupper(arg.transA)
+                 << '_' << arg.M << '_' << arg.N << '_' << arg.alpha << '_' << arg.alphai << '_' << arg.lda;
+
+            if(GEMV_TYPE == GEMV_STRIDED_BATCHED)
+                name << '_' << arg.stride_a;
+
+            name << '_' << arg.incx;
+
+            if(GEMV_TYPE == GEMV_STRIDED_BATCHED)
+                name << '_' << arg.stride_x;
+
+            name << '_' << arg.beta << '_' << arg.betai << '_' << arg.incy; // imaginary parts kept, as in the former gemv suite
+
+            if(GEMV_TYPE == GEMV_STRIDED_BATCHED)
+                name << '_' << arg.stride_y;
+
+            if(GEMV_TYPE == GEMV_STRIDED_BATCHED || GEMV_TYPE == GEMV_BATCHED)
+                name << '_' << arg.batch_count;
+
+            return std::move(name); // NOTE(review): presumably the name converts to std::string only as an rvalue - confirm
+        }
+    };
+
     // By default, arbitrary type combinations are invalid.
     // The unnamed second parameter is used for enable_if below.
     template <typename, typename = void>
@@ -39,40 +104,38 @@ namespace
                 testing_gemv<T>(arg);
             else if(!strcmp(arg.function, "gemv_bad_arg"))
                 testing_gemv_bad_arg<T>(arg);
+            else if(!strcmp(arg.function, "gemv_batched"))
+                testing_gemv_batched<T>(arg);
+            else if(!strcmp(arg.function, "gemv_batched_bad_arg"))
+                testing_gemv_batched_bad_arg<T>(arg);
+            else if(!strcmp(arg.function, "gemv_strided_batched"))
+                testing_gemv_strided_batched<T>(arg);
+            else if(!strcmp(arg.function, "gemv_strided_batched_bad_arg"))
+                testing_gemv_strided_batched_bad_arg<T>(arg);
             else
                 FAIL() << "Internal error: Test called with unknown function: " << arg.function;
         }
     };
 
-    struct gemv : RocBLAS_Test<gemv, gemv_testing>
+    using gemv = gemv_template<gemv_testing, GEMV>;
+    TEST_P(gemv, blas2)
     {
-        // Filter for which types apply to this suite
-        static bool type_filter(const Arguments& arg)
-        {
-            return rocblas_simple_dispatch<type_filter_functor>(arg);
-        }
-
-        // Filter for which functions apply to this suite
-        static bool function_filter(const Arguments& arg)
-        {
-            return !strcmp(arg.function, "gemv") || !strcmp(arg.function, "gemv_bad_arg");
-        }
+        rocblas_simple_dispatch<gemv_testing>(GetParam());
+    }
+    INSTANTIATE_TEST_CATEGORIES(gemv);
 
-        // Google Test name suffix based on parameters
-        static std::string name_suffix(const Arguments& arg)
-        {
-            return RocBLAS_TestName<gemv>{}
-                   << rocblas_datatype2string(arg.a_type) << '_' << (char)std::toupper(arg.transA)
-                   << '_' << arg.M << '_' << arg.N << '_' << arg.alpha << '_' << arg.alphai << '_'
-                   << arg.lda << '_' << arg.incx << '_' << arg.beta << '_' << arg.betai << '_'
-                   << arg.incy;
-        }
-    };
+    using gemv_batched = gemv_template<gemv_testing, GEMV_BATCHED>;
+    TEST_P(gemv_batched, blas2)
+    {
+        rocblas_simple_dispatch<gemv_testing>(GetParam());
+    }
+    INSTANTIATE_TEST_CATEGORIES(gemv_batched);
 
-    TEST_P(gemv, blas2)
+    using gemv_strided_batched = gemv_template<gemv_testing, GEMV_STRIDED_BATCHED>;
+    TEST_P(gemv_strided_batched, blas2)
     {
         rocblas_simple_dispatch<gemv_testing>(GetParam());
     }
-    INSTANTIATE_TEST_CATEGORIES(gemv);
+    INSTANTIATE_TEST_CATEGORIES(gemv_strided_batched);
 
 } // namespace
diff --git a/clients/gtest/gemv_strided_batched_gtest.yaml b/clients/gtest/gemv_strided_batched_gtest.yaml
new file mode 100644
index 000000000..7a4ddbd2e
--- /dev/null
+++ b/clients/gtest/gemv_strided_batched_gtest.yaml
@@ -0,0 +1,91 @@
+---
+include: rocblas_common.yaml
+include: known_bugs.yaml
+
+Definitions:
+  - &small_matrix_size_range
+    - { M:    -1, N:     1, lda:    1, stride_a:        1 }
+    - { M:     1, N:    -1, lda:    1, stride_a:        1 }
+    - { M:     1, N:     1, lda:    0, stride_a:        1 }
+    - { M:    10, N:    10, lda:    9, stride_a:        1 }
+    - { M:     0, N:     1, lda:    1, stride_a:        1 }
+    - { M:     1, N:     0, lda:    1, stride_a:        1 }
+    - { M:    -1, N:    -1, lda:   -1, stride_a:        1 }
+    - { M:    10, N:    10, lda:    2, stride_a:        1 }
+    - { M:   100, N:   200, lda:  200, stride_a:    40000 }
+
+  - &medium_matrix_size_range
+    - { M:   300, N:   400, lda:  400, stride_a:   160000 }
+    - { M:   600, N:   500, lda:  601, stride_a:   301000 }
+
+  - &large_matrix_size_range
+    - { M:  1000, N:  1000, lda: 1000, stride_a:  1000000 }
+    - { M:  2000, N:  2000, lda: 2000, stride_a:  4000000 }
+    - { M:  4011, N:  4011, lda: 4011, stride_a: 16088200 }
+    - { M:  8000, N:  8000, lda: 8000, stride_a: 64000000 }
+
+  - &incx_incy_range
+    - { incx:   2, incy:   1, stride_x: 8000, stride_y: 8000 }
+    - { incx:  -1, incy:   2, stride_x: 8000, stride_y: 8000 }
+    - { incx:   1, incy:   1, stride_x: 8000, stride_y: 8000 }
+    - { incx:  -1, incy:   3, stride_x: 4000, stride_y: 4000 }
+    - { incx:   3, incy:  -1, stride_x: 2000, stride_y: 2000 }
+    - { incx:   0, incy:   1, stride_x: 1000, stride_y: 1000 }
+    - { incx:   1, incy:   0, stride_x: 1000, stride_y: 1000 }
+    - { incx:   0, incy:  -1, stride_x:    1, stride_y:    1 }
+    - { incx:  10, incy: 100, stride_x: 8000, stride_y: 8000 }
+
+  - &alpha_beta_range
+    - { alpha:  2.0, beta:  0.0 }
+    - { alpha: -1.0, beta: -1.0 }
+    - { alpha:  2.0, beta:  1.0 }
+    - { alpha:  0.0, beta:  1.0 }
+
+Tests:
+- name: gemv_strided_batched_bad_arg
+  category: pre_checkin
+  function: gemv_strided_batched_bad_arg
+  precision: *single_double_precisions
+  transA: N
+
+- name: gemv_strided_batched_NaN
+  category: pre_checkin
+  function: gemv_strided_batched
+  precision: *single_double_precisions
+  transA: [ N, T, C ]
+  matrix_size: *medium_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha: [ -1.0, 0, 1.0, 2.0 ]
+  beta: .NaN  # converted to 0.0 in test code
+  batch_count: [ -1, 0, 1, 3 ]
+
+- name: gemv_strided_batched_small
+  category: quick
+  function: gemv_strided_batched
+  precision: *single_double_precisions
+  transA: [ N, T, C ]
+  matrix_size: *small_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha_beta: *alpha_beta_range
+  batch_count: [ -1, 0, 1, 3 ]
+
+- name: gemv_strided_batched_medium
+  category: pre_checkin
+  function: gemv_strided_batched
+  precision: *single_double_precisions_complex_real
+  transA: [ N, T, C ]
+  matrix_size: *medium_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha_beta: *alpha_beta_range
+  batch_count: [ 3 ]
+
+- name: gemv_strided_batched_large
+  category: nightly
+  function: gemv_strided_batched
+  precision: *single_double_precisions
+  transA: [ N, T, C ]
+  matrix_size: *large_matrix_size_range
+  incx_incy: *incx_incy_range
+  alpha_beta: *alpha_beta_range
+  batch_count: [ 3 ]
+...
diff --git a/clients/gtest/rocblas_gtest.yaml b/clients/gtest/rocblas_gtest.yaml
index c7814e1f7..601ad4196 100644
--- a/clients/gtest/rocblas_gtest.yaml
+++ b/clients/gtest/rocblas_gtest.yaml
@@ -1,5 +1,7 @@
 include: blas1_gtest.yaml
 include: gemv_gtest.yaml
+include: gemv_batched_gtest.yaml
+include: gemv_strided_batched_gtest.yaml
 include: gemm_gtest.yaml
 include: gemm_strided_batched_gtest.yaml
 include: symv_gtest.yaml
diff --git a/clients/include/norm.hpp b/clients/include/norm.hpp
index 34eeea8e2..7a38e678d 100644
--- a/clients/include/norm.hpp
+++ b/clients/include/norm.hpp
@@ -7,6 +7,7 @@
 #define _NORM_H
 
 #include "rocblas.h"
+#include "rocblas_vector.hpp"
 
 /* =====================================================================
         Norm check: norm(A-B)/norm(A), evaluate relative error
@@ -39,8 +40,18 @@ double norm_check_general(char        norm_type,
                           T*          hCPU,
                           T*          hGPU);
 
-/*! \brief  Template: norm check for hermitian/symmetric Matrix: half/float/double/complex */
+/*! \brief  Template: norm check for batched matrices, one host_vector per batch: float/double/complex */
+template <typename T>
+double norm_check_general(char           norm_type,
+                          rocblas_int    M,
+                          rocblas_int    N,
+                          rocblas_int    lda,
+                          rocblas_int    batch_count,
+                          host_vector<T> hCPU[],
+                          host_vector<T> hGPU[]);
 
+
+/*! \brief  Template: norm check for hermitian/symmetric Matrix: half/float/double/complex */
 template <typename T>
 double norm_check_symmetric(
     char norm_type, char uplo, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU);
diff --git a/clients/include/rocblas.hpp b/clients/include/rocblas.hpp
index fbecd9189..662ced891 100644
--- a/clients/include/rocblas.hpp
+++ b/clients/include/rocblas.hpp
@@ -331,6 +331,65 @@ static constexpr auto rocblas_gemv<rocblas_float_complex> = rocblas_cgemv;
 template <>
 static constexpr auto rocblas_gemv<rocblas_double_complex> = rocblas_zgemv;
 
+// gemv_strided_batched
+template <typename T>
+rocblas_status (*rocblas_gemv_strided_batched)(rocblas_handle    handle,
+                                               rocblas_operation transA,
+                                               rocblas_int       m,
+                                               rocblas_int       n,
+                                               const T*          alpha,
+                                               const T*          A,
+                                               rocblas_int       lda,
+                                               rocblas_int       stride_a,
+                                               const T*          x,
+                                               rocblas_int       incx,
+                                               rocblas_int       stride_x,
+                                               const T*          beta,
+                                               T*                y,
+                                               rocblas_int       incy,
+                                               rocblas_int       stride_y,
+                                               rocblas_int       batch_count);
+
+template <>
+static constexpr auto rocblas_gemv_strided_batched<float> = rocblas_sgemv_strided_batched;
+
+template <>
+static constexpr auto rocblas_gemv_strided_batched<double> = rocblas_dgemv_strided_batched;
+
+template <>
+static constexpr auto rocblas_gemv_strided_batched<rocblas_float_complex> = rocblas_cgemv_strided_batched;
+
+template <>
+static constexpr auto rocblas_gemv_strided_batched<rocblas_double_complex> = rocblas_zgemv_strided_batched;
+
+// gemv_batched
+template <typename T>
+rocblas_status (*rocblas_gemv_batched)(rocblas_handle    handle,
+                                       rocblas_operation transA,
+                                       rocblas_int       m,
+                                       rocblas_int       n,
+                                       const T*          alpha,
+                                       const T* const    A[],
+                                       rocblas_int       lda,
+                                       const T* const    x[],
+                                       rocblas_int       incx,
+                                       const T*          beta,
+                                       T* const          y[],
+                                       rocblas_int       incy,
+                                       rocblas_int       batch_count);
+
+template <>
+static constexpr auto rocblas_gemv_batched<float> = rocblas_sgemv_batched;
+
+template <>
+static constexpr auto rocblas_gemv_batched<double> = rocblas_dgemv_batched;
+
+template <>
+static constexpr auto rocblas_gemv_batched<rocblas_float_complex> = rocblas_cgemv_batched;
+
+template <>
+static constexpr auto rocblas_gemv_batched<rocblas_double_complex> = rocblas_zgemv_batched;
+
 // trsv
 template <typename T>
 rocblas_status (*rocblas_trsv)(rocblas_handle    handle,
diff --git a/clients/include/rocblas_arguments.hpp b/clients/include/rocblas_arguments.hpp
index 72a72f333..06987cea7 100644
--- a/clients/include/rocblas_arguments.hpp
+++ b/clients/include/rocblas_arguments.hpp
@@ -63,6 +63,9 @@ struct Arguments
     rocblas_int stride_c; //  stride_c > ldc * N
     rocblas_int stride_d; //  stride_d > ldd * N
 
+    rocblas_int stride_x;
+    rocblas_int stride_y;
+
     rocblas_int norm_check;
     rocblas_int unit_check;
     rocblas_int timing;
@@ -145,6 +148,8 @@ struct Arguments
         ROCBLAS_FORMAT_CHECK(stride_b);
         ROCBLAS_FORMAT_CHECK(stride_c);
         ROCBLAS_FORMAT_CHECK(stride_d);
+        ROCBLAS_FORMAT_CHECK(stride_x);
+        ROCBLAS_FORMAT_CHECK(stride_y);
         ROCBLAS_FORMAT_CHECK(norm_check);
         ROCBLAS_FORMAT_CHECK(unit_check);
         ROCBLAS_FORMAT_CHECK(timing);
@@ -291,6 +296,8 @@ struct Arguments
         print("stride_b", arg.stride_b);
         print("stride_c", arg.stride_c);
         print("stride_d", arg.stride_d);
+        print("stride_x", arg.stride_x);
+        print("stride_y", arg.stride_y);
         print("algo", arg.algo);
         print("solution_index", arg.solution_index);
         print("flags", arg.flags);
diff --git a/clients/include/rocblas_common.yaml b/clients/include/rocblas_common.yaml
index 2dc712ca2..22efdc51e 100644
--- a/clients/include/rocblas_common.yaml
+++ b/clients/include/rocblas_common.yaml
@@ -208,6 +208,8 @@ Arguments:
   - stride_b: rocblas_int
   - stride_c: rocblas_int
   - stride_d: rocblas_int
+  - stride_x: rocblas_int
+  - stride_y: rocblas_int
   - norm_check: rocblas_int
   - unit_check: rocblas_int
   - timing: rocblas_int
@@ -277,6 +279,8 @@ Defaults:
   stride_b: 0
   stride_c: 0
   stride_d: 0
+  stride_x: 0
+  stride_y: 0
   norm_check: 0
   unit_check: 1
   timing: 0
diff --git a/clients/include/rocblas_template.yaml b/clients/include/rocblas_template.yaml
index ffb6d66e0..87e12e61b 100644
--- a/clients/include/rocblas_template.yaml
+++ b/clients/include/rocblas_template.yaml
@@ -30,6 +30,10 @@ Functions:
   rocblas_dgemv: { function: gemv, <<: *double_precision }
   rocblas_cgemv: { function: gemv, <<: *single_precision_complex }
   rocblas_zgemv: { function: gemv, <<: *double_precision_complex } 
+  rocblas_sgemv_strided_batched: { function: gemv_strided_batched, <<: *single_precision }
+  rocblas_dgemv_strided_batched: { function: gemv_strided_batched, <<: *double_precision }
+  rocblas_sgemv_batched: { function: gemv_batched, <<: *single_precision }
+  rocblas_dgemv_batched: { function: gemv_batched, <<: *double_precision }  
   rocblas_strsv: { function: trsv, <<: *single_precision }
   rocblas_dtrsv: { function: trsv, <<: *double_precision }
   rocblas_ssymv: { function: symv, <<: *single_precision }
diff --git a/clients/include/rocblas_vector.hpp b/clients/include/rocblas_vector.hpp
index 46dbb2f6b..c4efabb38 100644
--- a/clients/include/rocblas_vector.hpp
+++ b/clients/include/rocblas_vector.hpp
@@ -14,110 +14,147 @@
 #include <vector>
 
 /* ============================================================================================ */
-/*! \brief  pseudo-vector class which uses device memory */
-
-template <typename T, size_t PAD = 4096>
-class device_vector
+/*! \brief  base-class to allocate/deallocate device memory */
+template<typename T, size_t PAD, typename U>
+class d_vector
 {
-#ifdef GOOGLE_TEST
-
-    T guard[PAD];
+protected:
+    size_t size, bytes;
+
+    #ifdef GOOGLE_TEST
+    U guard[PAD];
+    d_vector(size_t s) : size(s), bytes((s + PAD*2) * sizeof(T)) {
+        // Initialize guard with random data
+        if (PAD > 0) {
+            rocblas_init_nan(guard, PAD);
+        }
+    }
+    #else
+    d_vector(size_t s) : size(s), bytes(s ? s * sizeof(T) : sizeof(T)) {}
+    #endif
 
-    void device_vector_setup()
+    T* device_vector_setup()
     {
-        if((hipMalloc)(&data, bytes) != hipSuccess)
+        T* d;
+        if((hipMalloc)(&d, bytes) != hipSuccess)
         {
             static char* lc = setlocale(LC_NUMERIC, "");
             fprintf(stderr, "Error allocating %'zu bytes (%zu GB)\n", bytes, bytes >> 30);
-            data = nullptr;
+            d = nullptr;
         }
+        #ifdef GOOGLE_TEST
         else
         {
-            // Initialize guard with random data
-            rocblas_init_nan(guard, PAD);
+            if (PAD > 0) {
+                // Copy guard to device memory before allocated memory
+                hipMemcpy(d, guard, sizeof(guard), hipMemcpyHostToDevice);
 
-            // Copy guard to device memory before allocated memory
-            CHECK_HIP_ERROR(hipMemcpy(data, guard, sizeof(guard), hipMemcpyHostToDevice));
+                // Point to allocated block
+                d += PAD;
 
-            // Point to allocated block
-            data += PAD;
-
-            // Copy guard to device memory after allocated memory
-            CHECK_HIP_ERROR(hipMemcpy(data + size, guard, sizeof(guard), hipMemcpyHostToDevice));
+                // Copy guard to device memory after allocated memory
+                hipMemcpy(d + size, guard, sizeof(guard), hipMemcpyHostToDevice);
+            }
         }
+        #endif
+        return d;
     }
 
-    void device_vector_teardown()
+    void device_vector_teardown(T* d)
     {
-        if(data != nullptr)
+        if(d != nullptr)
         {
-            T host[PAD];
-
-            // Copy device memory after allocated memory to host
-            CHECK_HIP_ERROR(hipMemcpy(host, data + size, sizeof(guard), hipMemcpyDeviceToHost));
+            #ifdef GOOGLE_TEST
+            if (PAD > 0) {
+                U host[PAD];
 
-            // Make sure no corruption has occurred
-            EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0);
+                // Copy device memory after allocated memory to host
+                hipMemcpy(host, d + size, sizeof(guard), hipMemcpyDeviceToHost);
 
-            // Point to guard before allocated memory
-            data -= PAD;
+                // Make sure no corruption has occurred
+                EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0);
 
-            // Copy device memory after allocated memory to host
-            CHECK_HIP_ERROR(hipMemcpy(host, data, sizeof(guard), hipMemcpyDeviceToHost));
+                // Point to guard before allocated memory
+                d -= PAD;
 
-            // Make sure no corruption has occurred
-            EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0);
+                // Copy device memory before allocated memory to host
+                hipMemcpy(host, d, sizeof(guard), hipMemcpyDeviceToHost);
 
+                // Make sure no corruption has occurred
+                EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0);
+            }
+            #endif
             // Free device memory
-            CHECK_HIP_ERROR((hipFree)(data));
+            CHECK_HIP_ERROR((hipFree)(d));
         }
     }
+};
 
+/* ============================================================================================ */
+/*! \brief  pseudo-vector subclass which owns a batch of device memory blocks and keeps
+            their device pointers in an array allocated in host memory */
+template<typename T, size_t PAD = 4096, typename U = T>
+class device_batch_vector : private d_vector<T,PAD,U>
+{
 public:
-    // Must wrap constructor and destructor in functions to allow Google Test macros to work
-    explicit device_vector(size_t size)
-        : size(size)
-        , bytes((size + PAD * 2) * sizeof(T))
+    explicit device_batch_vector(size_t b, size_t s) : d_vector<T,PAD,U>(s), batch(b) // base listed first: matches real init order
     {
-        device_vector_setup();
+        data = (T**) malloc(batch*sizeof(T*)); // host array: one device pointer per batch entry
+        for(size_t i=0;i<batch;++i) // size_t index: same type as batch, does not shadow parameter b
+            data[i] = this->device_vector_setup(); // nullptr when the device allocation failed
     }
-
-    ~device_vector()
+
+    ~device_batch_vector()
     {
-        device_vector_teardown();
+        if(data != nullptr) {
+            for(size_t i=0;i<batch;++i)
+                this->device_vector_teardown(data[i]); // skips nullptr entries, otherwise frees the block
+            free(data);
+        }
     }
+
+    T* operator [](int n) // device pointer of batch entry n (no bounds check)
+    {
+        return data[n];
+    }
+
+    operator T**() // decay into the host array of device pointers
+    {
+        return data;
+    }
+
+    // Disallow copying or assigning
+    device_batch_vector(const device_batch_vector&) = delete;
+    device_batch_vector& operator=(const device_batch_vector&) = delete;
 
-#else // GOOGLE_TEST
-
-    // Code without memory guards
+private:
+    T** data;
+    size_t batch;
+};
 
+/* ============================================================================================ */
+/*! \brief  pseudo-vector subclass which uses device memory */
+template <typename T, size_t PAD = 4096, typename U = T>
+class device_vector : private d_vector<T,PAD,U>
+{
 public:
-    explicit device_vector(size_t size)
-        : size(size)
-        , bytes(size ? size * sizeof(T) : sizeof(T))
+    // Must wrap constructor and destructor in functions to allow Google Test macros to work
+    explicit device_vector(size_t s) : d_vector<T,PAD,U>(s) 
     {
-        if((hipMalloc)(&data, bytes) != hipSuccess)
-        {
-            static char* lc = setlocale(LC_NUMERIC, "");
-            fprintf(stderr, "Error allocating %'zu bytes (%'zu GB)\n", bytes, bytes >> 30);
-            data = nullptr;
-        }
+        data = this->device_vector_setup();
     }
 
     ~device_vector()
     {
-        if(data != nullptr)
-            CHECK_HIP_ERROR((hipFree)(data));
+        this->device_vector_teardown(data);
     }
 
-#endif // GOOGLE_TEST
-
-public:
     // Decay into pointer wherever pointer is expected
     operator T*()
     {
         return data;
     }
+    
     operator const T*() const
     {
         return data;
@@ -135,11 +172,10 @@ class device_vector
 
 private:
     T*           data;
-    const size_t size, bytes;
 };
 
 /* ============================================================================================ */
-/*! \brief  pseudo-vector class which uses host memory */
+/*! \brief  pseudo-vector subclass which uses host memory */
 template <typename T>
 struct host_vector : std::vector<T>
 {
diff --git a/clients/include/testing_gemv.hpp b/clients/include/testing_gemv.hpp
index 1e30abb9e..a48958d1e 100644
--- a/clients/include/testing_gemv.hpp
+++ b/clients/include/testing_gemv.hpp
@@ -254,14 +254,14 @@ void testing_gemv(const Arguments& arg)
         rocblas_bandwidth = (1.0 * M * N) * sizeof(T) / gpu_time_used / 1e3;
 
         // only norm_check return an norm error, unit check won't return anything
-        std::cout << "M,N,alpha,lda,incx,incy,rocblas-Gflops,rocblas-GB/s,";
+        std::cout << "M,N,alpha,lda,incx,beta,incy,rocblas-Gflops,rocblas-GB/s,";
         if(arg.norm_check)
         {
             std::cout << "CPU-Gflops,norm_error_host_ptr,norm_error_device_ptr";
         }
         std::cout << std::endl;
 
-        std::cout << M << "," << N << "," << h_alpha << "," << lda << "," << incx << "," << incy
+        std::cout << M << "," << N << "," << h_alpha << "," << lda << "," << incx << "," << h_beta << "," << incy
                   << "," << rocblas_gflops << "," << rocblas_bandwidth << ",";
 
         if(arg.norm_check)
diff --git a/clients/include/testing_gemv_batched.hpp b/clients/include/testing_gemv_batched.hpp
new file mode 100644
index 000000000..3f4f914a7
--- /dev/null
+++ b/clients/include/testing_gemv_batched.hpp
@@ -0,0 +1,360 @@
+/* ************************************************************************
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * ************************************************************************ */
+
+#include "cblas_interface.hpp"
+#include "flops.hpp"
+#include "norm.hpp"
+#include "rocblas.hpp"
+#include "rocblas_datatype2string.hpp"
+#include "rocblas_init.hpp"
+#include "rocblas_math.hpp"
+#include "rocblas_random.hpp"
+#include "rocblas_test.hpp"
+#include "rocblas_vector.hpp"
+#include "unit.hpp"
+#include "utility.hpp"
+
+template <typename T>
+void testing_gemv_batched_bad_arg(const Arguments& arg)
+{
+    const rocblas_int M           = 100;
+    const rocblas_int N           = 100;
+    const rocblas_int lda         = 100;
+    const rocblas_int incx        = 1;
+    const rocblas_int incy        = 1;
+    const T           alpha       = 1.0;
+    const T           beta        = 1.0;
+    const rocblas_int batch_count = 5;
+
+    const rocblas_operation transA = rocblas_operation_none;
+
+    rocblas_local_handle handle;
+
+    // allocate memory on device
+    device_vector<T*,0,T> dA(batch_count);
+    device_vector<T*,0,T> dx(batch_count);
+    device_vector<T*,0,T> dy(batch_count);
+    
+    if(!dA || !dx || !dy)
+    {
+        CHECK_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    EXPECT_ROCBLAS_STATUS(
+        rocblas_gemv_batched<T>(
+            handle, transA, M, N, &alpha, nullptr, lda, dx, incx, &beta, dy, incy, batch_count),
+        rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(
+        rocblas_gemv_batched<T>(
+            handle, transA, M, N, &alpha, dA, lda, nullptr, incx, &beta, dy, incy, batch_count),
+        rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(
+        rocblas_gemv_batched<T>(
+            handle, transA, M, N, &alpha, dA, lda, dx, incx, &beta, nullptr, incy, batch_count),
+        rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(
+        rocblas_gemv_batched<T>(
+            handle, transA, M, N, nullptr, dA, lda, dx, incx, &beta, dy, incy, batch_count),
+        rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(
+        rocblas_gemv_batched<T>(
+            handle, transA, M, N, &alpha, dA, lda, dx, incx, nullptr, dy, incy, batch_count),
+        rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(
+        rocblas_gemv_batched<T>(
+            nullptr, transA, M, N, &alpha, dA, lda, dx, incx, &beta, dy, incy, batch_count),
+        rocblas_status_invalid_handle);
+}
+
+template <typename T>
+void testing_gemv_batched(const Arguments& arg)
+{
+    rocblas_int       M           = arg.M;
+    rocblas_int       N           = arg.N;
+    rocblas_int       lda         = arg.lda;
+    rocblas_int       incx        = arg.incx;
+    rocblas_int       incy        = arg.incy;
+    T                 h_alpha     = arg.get_alpha<T>();
+    T                 h_beta      = arg.get_beta<T>();
+    rocblas_operation transA      = char2rocblas_operation(arg.transA);
+    rocblas_int       batch_count = arg.batch_count;
+
+    rocblas_local_handle handle;
+
+    // argument sanity check before allocating invalid memory
+    if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || batch_count < 0)
+    {
+        device_vector<T*,0,T> dAA1(1);
+        device_vector<T*,0,T> dxA1(1);
+        device_vector<T*,0,T> dy_1A1(1);
+
+        if(!dAA1 || !dxA1 || !dy_1A1)
+        {
+            CHECK_HIP_ERROR(hipErrorOutOfMemory);
+            return;
+        }
+
+        EXPECT_ROCBLAS_STATUS(rocblas_gemv_batched<T>(handle,
+                                                      transA,
+                                                      M,
+                                                      N,
+                                                      &h_alpha,
+                                                      dAA1,
+                                                      lda,
+                                                      dxA1,
+                                                      incx,
+                                                      &h_beta,
+                                                      dy_1A1,
+                                                      incy,
+                                                      batch_count),
+                              rocblas_status_invalid_size);
+
+        return;
+    }
+
+    //quick return
+    if(!M || !N || !batch_count)
+        return;
+
+    //Device-arrays of pointers to device memory
+    device_vector<T*,0,T> dAA(batch_count);
+    device_vector<T*,0,T> dxA(batch_count);
+    device_vector<T*,0,T> dy_1A(batch_count);
+    device_vector<T*,0,T> dy_2A(batch_count);
+
+    if(!dAA || !dxA || !dy_1A || !dy_2A)
+    {
+        CHECK_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    size_t size_A = lda * static_cast<size_t>(N);
+    size_t size_x, dim_x, abs_incx;
+    size_t size_y, dim_y, abs_incy;
+
+    if(transA == rocblas_operation_none)
+    {
+        dim_x = N;
+        dim_y = M;
+    }
+    else
+    {
+        dim_x = M;
+        dim_y = N;
+    }
+
+    abs_incx = incx >= 0 ? incx : -incx;
+    abs_incy = incy >= 0 ? incy : -incy;
+
+    size_x = dim_x * abs_incx;
+    size_y = dim_y * abs_incy;
+
+    // Host-arrays of host vectors (one host_vector per batch instance)
+    host_vector<T> hAA[batch_count];
+    host_vector<T> hxA[batch_count];
+    host_vector<T> hy_1A[batch_count];
+    host_vector<T> hy_2A[batch_count];
+    host_vector<T> hy_goldA[batch_count];
+    for(int b = 0; b < batch_count; ++b)
+    {
+        hAA[b]      = host_vector<T>(size_A);
+        hxA[b]      = host_vector<T>(size_x);
+        hy_1A[b]    = host_vector<T>(size_y);
+        hy_2A[b]    = host_vector<T>(size_y);
+        hy_goldA[b] = host_vector<T>(size_y);
+    }
+
+    // Host-arrays of pointers to device memory
+    // (intermediate arrays used for the transfers)
+    device_batch_vector<T> AA(batch_count,size_A);
+    device_batch_vector<T> xA(batch_count,size_x);
+    device_batch_vector<T> y_1A(batch_count,size_y);
+    device_batch_vector<T> y_2A(batch_count,size_y);
+
+    device_vector<T> d_alpha(1);
+    device_vector<T> d_beta(1);
+
+    int last = batch_count - 1;
+    if((!AA[last] && size_A) || (!xA[last] && size_x) || ((!y_1A[last] || !y_2A[last]) && size_y)
+       || !d_alpha || !d_beta)
+    {
+        CHECK_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    // Initial Data on CPU
+    rocblas_seedrand();
+    for(int b = 0; b < batch_count; ++b)
+    {
+        rocblas_init<T>(hAA[b], M, N, lda);
+        rocblas_init<T>(hxA[b], 1, dim_x, abs_incx);
+        if(rocblas_isnan(arg.beta))
+            rocblas_init_nan<T>(hy_1A[b], 1, dim_y, abs_incy);
+        else
+            rocblas_init<T>(hy_1A[b], 1, dim_y, abs_incy);
+        hy_goldA[b] = hy_1A[b];
+        hy_2A[b]    = hy_1A[b];
+    }
+
+    // copy data from CPU to device
+    // 1. Use intermediate arrays to access device memory from host
+    for(int b = 0; b < batch_count; ++b)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(AA[b], hAA[b], sizeof(T) * size_A, hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(xA[b], hxA[b], sizeof(T) * size_x, hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(y_1A[b], hy_1A[b], sizeof(T) * size_y, hipMemcpyHostToDevice));
+    }
+    // 2. Copy intermediate arrays into device arrays
+    CHECK_HIP_ERROR(hipMemcpy(dAA, AA, sizeof(T*) * batch_count, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dxA, xA, sizeof(T*) * batch_count, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dy_1A, y_1A, sizeof(T*) * batch_count, hipMemcpyHostToDevice));
+
+    double gpu_time_used, cpu_time_used;
+    double rocblas_gflops, cblas_gflops, rocblas_bandwidth;
+    double rocblas_error_1;
+    double rocblas_error_2;
+
+    /* =====================================================================
+           ROCBLAS
+    =================================================================== */
+    if(arg.unit_check || arg.norm_check)
+    {
+        for(int b = 0; b < batch_count; ++b)
+        {
+            CHECK_HIP_ERROR(
+                hipMemcpy(y_2A[b], hy_2A[b], sizeof(T) * size_y, hipMemcpyHostToDevice));
+        }
+        CHECK_HIP_ERROR(hipMemcpy(dy_2A, y_2A, sizeof(T*) * batch_count, hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice));
+
+        CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host));
+        CHECK_ROCBLAS_ERROR(rocblas_gemv_batched<T>(handle,
+                                                    transA,
+                                                    M,
+                                                    N,
+                                                    &h_alpha,
+                                                    dAA,
+                                                    lda,
+                                                    dxA,
+                                                    incx,
+                                                    &h_beta,
+                                                    dy_1A,
+                                                    incy,
+                                                    batch_count));
+
+        CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device));
+        CHECK_ROCBLAS_ERROR(rocblas_gemv_batched<T>(
+            handle, transA, M, N, d_alpha, dAA, lda, dxA, incx, d_beta, dy_2A, incy, batch_count));
+
+        // copy output from device to CPU
+        // Use intermediate arrays to access device memory from host
+        for(int b = 0; b < batch_count; ++b)
+        {
+            CHECK_HIP_ERROR(
+                hipMemcpy(hy_1A[b], y_1A[b], sizeof(T) * size_y, hipMemcpyDeviceToHost));
+            CHECK_HIP_ERROR(
+                hipMemcpy(hy_2A[b], y_2A[b], sizeof(T) * size_y, hipMemcpyDeviceToHost));
+        }
+
+        // CPU BLAS
+        cpu_time_used = get_time_us();
+        for(int b = 0; b < batch_count; ++b)
+        {
+            cblas_gemv<T>(
+                transA, M, N, h_alpha, hAA[b], lda, hxA[b], incx, h_beta, hy_goldA[b], incy);
+        }
+        cpu_time_used = get_time_us() - cpu_time_used;
+        cblas_gflops  = batch_count * gemv_gflop_count<T>(M, N) / cpu_time_used * 1e6;
+
+        if(arg.unit_check)
+        {
+            unit_check_general<T>(1, dim_y, batch_count, abs_incy, hy_goldA, hy_1A);
+            unit_check_general<T>(1, dim_y, batch_count, abs_incy, hy_goldA, hy_2A);
+        }
+
+        if(arg.norm_check)
+        {
+            rocblas_error_1
+                = norm_check_general<T>('F', 1, dim_y, abs_incy, batch_count, hy_goldA, hy_1A);
+            rocblas_error_2
+                = norm_check_general<T>('F', 1, dim_y, abs_incy, batch_count, hy_goldA, hy_2A);
+        }
+    }
+
+    if(arg.timing)
+    {
+        int number_cold_calls = 2;
+        int number_hot_calls  = 100;
+        CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host));
+
+        for(int iter = 0; iter < number_cold_calls; iter++)
+        {
+            rocblas_gemv_batched<T>(handle,
+                                    transA,
+                                    M,
+                                    N,
+                                    &h_alpha,
+                                    dAA,
+                                    lda,
+                                    dxA,
+                                    incx,
+                                    &h_beta,
+                                    dy_1A,
+                                    incy,
+                                    batch_count);
+        }
+
+        gpu_time_used = get_time_us(); // in microseconds
+
+        for(int iter = 0; iter < number_hot_calls; iter++)
+        {
+            rocblas_gemv_batched<T>(handle,
+                                    transA,
+                                    M,
+                                    N,
+                                    &h_alpha,
+                                    dAA,
+                                    lda,
+                                    dxA,
+                                    incx,
+                                    &h_beta,
+                                    dy_1A,
+                                    incy,
+                                    batch_count);
+        }
+
+        gpu_time_used     = (get_time_us() - gpu_time_used) / number_hot_calls;
+        rocblas_gflops    = batch_count * gemv_gflop_count<T>(M, N) / gpu_time_used * 1e6;
+        rocblas_bandwidth = batch_count * (1.0 * M * N) * sizeof(T) / gpu_time_used / 1e3;
+
+        // only norm_check returns a norm error; unit_check won't return anything
+        std::cout << "M,N,alpha,lda,incx,beta,incy,batch_count,rocblas-Gflops,rocblas-GB/s,";
+        if(arg.norm_check)
+        {
+            std::cout << "CPU-Gflops,norm_error_host_ptr,norm_error_device_ptr";
+        }
+        std::cout << std::endl;
+
+        std::cout << M << "," << N << "," << h_alpha << "," << lda << "," << incx << "," << h_beta
+                  << "," << incy << "," << batch_count << "," << rocblas_gflops << ","
+                  << rocblas_bandwidth << ",";
+
+        if(arg.norm_check)
+        {
+            std::cout << cblas_gflops << ',';
+            std::cout << rocblas_error_1 << ',' << rocblas_error_2;
+        }
+
+        std::cout << std::endl;
+    }
+
+}
diff --git a/clients/include/testing_gemv_strided_batched.hpp b/clients/include/testing_gemv_strided_batched.hpp
new file mode 100644
index 000000000..459877bf2
--- /dev/null
+++ b/clients/include/testing_gemv_strided_batched.hpp
@@ -0,0 +1,442 @@
+/* ************************************************************************
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * ************************************************************************ */
+
+#include "cblas_interface.hpp"
+#include "flops.hpp"
+#include "norm.hpp"
+#include "rocblas.hpp"
+#include "rocblas_datatype2string.hpp"
+#include "rocblas_init.hpp"
+#include "rocblas_math.hpp"
+#include "rocblas_random.hpp"
+#include "rocblas_test.hpp"
+#include "rocblas_vector.hpp"
+#include "unit.hpp"
+#include "utility.hpp"
+
+template <typename T>
+void testing_gemv_strided_batched_bad_arg(const Arguments& arg)
+{ // Every call below passes exactly one null argument and must return the matching status.
+    const rocblas_int M           = 100;
+    const rocblas_int N           = 100;
+    const rocblas_int lda         = 100;
+    const rocblas_int incx        = 1;
+    const rocblas_int incy        = 1;
+    const T           alpha       = 1.0;
+    const T           beta        = 1.0;
+    const rocblas_int stride_a    = 10000; // equals lda * N
+    const rocblas_int stride_x    = 100; // equals N * incx
+    const rocblas_int stride_y    = 100; // equals M * incy
+    const rocblas_int batch_count = 5;
+
+    const rocblas_operation transA = rocblas_operation_none;
+
+    rocblas_local_handle handle;
+
+    size_t size_A = lda * static_cast<size_t>(N);
+    size_t size_x = N * static_cast<size_t>(incx);
+    size_t size_y = M * static_cast<size_t>(incy);
+
+    // allocate memory on device; sized for a single batch instance although batch_count is 5
+    device_vector<T> dA(size_A);
+    device_vector<T> dx(size_x);
+    device_vector<T> dy(size_y);
+    if(!dA || !dx || !dy)
+    {
+        CHECK_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(handle,
+                                                          transA,
+                                                          M,
+                                                          N,
+                                                          &alpha,
+                                                          nullptr, // A
+                                                          lda,
+                                                          stride_a,
+                                                          dx,
+                                                          incx,
+                                                          stride_x,
+                                                          &beta,
+                                                          dy,
+                                                          incy,
+                                                          stride_y,
+                                                          batch_count),
+                          rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(handle,
+                                                          transA,
+                                                          M,
+                                                          N,
+                                                          &alpha,
+                                                          dA,
+                                                          lda,
+                                                          stride_a,
+                                                          nullptr, // x
+                                                          incx,
+                                                          stride_x,
+                                                          &beta,
+                                                          dy,
+                                                          incy,
+                                                          stride_y,
+                                                          batch_count),
+                          rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(handle,
+                                                          transA,
+                                                          M,
+                                                          N,
+                                                          &alpha,
+                                                          dA,
+                                                          lda,
+                                                          stride_a,
+                                                          dx,
+                                                          incx,
+                                                          stride_x,
+                                                          &beta,
+                                                          nullptr, // y
+                                                          incy,
+                                                          stride_y,
+                                                          batch_count),
+                          rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(handle,
+                                                          transA,
+                                                          M,
+                                                          N,
+                                                          nullptr, // alpha
+                                                          dA,
+                                                          lda,
+                                                          stride_a,
+                                                          dx,
+                                                          incx,
+                                                          stride_x,
+                                                          &beta,
+                                                          dy,
+                                                          incy,
+                                                          stride_y,
+                                                          batch_count),
+                          rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(handle,
+                                                          transA,
+                                                          M,
+                                                          N,
+                                                          &alpha,
+                                                          dA,
+                                                          lda,
+                                                          stride_a,
+                                                          dx,
+                                                          incx,
+                                                          stride_x,
+                                                          nullptr, // beta
+                                                          dy,
+                                                          incy,
+                                                          stride_y,
+                                                          batch_count),
+                          rocblas_status_invalid_pointer);
+
+    EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(nullptr, // handle
+                                                          transA,
+                                                          M,
+                                                          N,
+                                                          &alpha,
+                                                          dA,
+                                                          lda,
+                                                          stride_a,
+                                                          dx,
+                                                          incx,
+                                                          stride_x,
+                                                          &beta,
+                                                          dy,
+                                                          incy,
+                                                          stride_y,
+                                                          batch_count),
+                          rocblas_status_invalid_handle);
+}
+
+// Functional and timing test for rocblas_gemv_strided_batched<T>. Invalid sizes must be rejected
+// with rocblas_status_invalid_size; otherwise the results computed with host- and device-side
+// alpha/beta are compared against a per-batch cblas_gemv reference (unit and/or norm check).
+template <typename T>
+void testing_gemv_strided_batched(const Arguments& arg)
+{
+    rocblas_int       M           = arg.M;
+    rocblas_int       N           = arg.N;
+    rocblas_int       lda         = arg.lda;
+    rocblas_int       incx        = arg.incx;
+    rocblas_int       incy        = arg.incy;
+    T                 h_alpha     = arg.get_alpha<T>();
+    T                 h_beta      = arg.get_beta<T>();
+    rocblas_operation transA      = char2rocblas_operation(arg.transA);
+    rocblas_int       stride_a    = arg.stride_a;
+    rocblas_int       stride_x    = arg.stride_x;
+    rocblas_int       stride_y    = arg.stride_y;
+    rocblas_int       batch_count = arg.batch_count;
+
+    rocblas_local_handle handle;
+    size_t               size_A = lda * static_cast<size_t>(N);
+    size_t               size_x, dim_x, abs_incx;
+    size_t               size_y, dim_y, abs_incy;
+
+    if(transA == rocblas_operation_none)
+    {
+        dim_x = N;
+        dim_y = M;
+    }
+    else
+    {
+        dim_x = M;
+        dim_y = N;
+    }
+
+    abs_incx = incx >= 0 ? incx : -incx;
+    abs_incy = incy >= 0 ? incy : -incy;
+
+    size_x = dim_x * abs_incx;
+    size_y = dim_y * abs_incy;
+
+    // argument sanity check before allocating invalid memory; negative strides are invalid too
+    if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || batch_count < 0 || stride_a < 0
+       || stride_x < 0 || stride_y < 0 || static_cast<size_t>(stride_a) < size_A
+       || static_cast<size_t>(stride_x) < size_x || static_cast<size_t>(stride_y) < size_y)
+    {
+        static const size_t safe_size = 100; // arbitrarily set to 100
+        device_vector<T>    dA1(safe_size);
+        device_vector<T>    dx1(safe_size);
+        device_vector<T>    dy1(safe_size);
+        if(!dA1 || !dx1 || !dy1)
+        {
+            CHECK_HIP_ERROR(hipErrorOutOfMemory);
+            return;
+        }
+
+        EXPECT_ROCBLAS_STATUS(rocblas_gemv_strided_batched<T>(handle,
+                                                              transA,
+                                                              M,
+                                                              N,
+                                                              &h_alpha,
+                                                              dA1,
+                                                              lda,
+                                                              stride_a,
+                                                              dx1,
+                                                              incx,
+                                                              stride_x,
+                                                              &h_beta,
+                                                              dy1,
+                                                              incy,
+                                                              stride_y,
+                                                              batch_count),
+                              rocblas_status_invalid_size);
+
+        return;
+    }
+
+    //quick return
+    if(!M || !N || !batch_count)
+        return;
+
+    size_A = size_A + static_cast<size_t>(stride_a) * static_cast<size_t>(batch_count - 1);
+    size_x = size_x + static_cast<size_t>(stride_x) * static_cast<size_t>(batch_count - 1);
+    size_y = size_y + static_cast<size_t>(stride_y) * static_cast<size_t>(batch_count - 1);
+
+    // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory
+    host_vector<T> hA(size_A);
+    host_vector<T> hx(size_x);
+    host_vector<T> hy_1(size_y);
+    host_vector<T> hy_2(size_y);
+    host_vector<T> hy_gold(size_y);
+
+    device_vector<T> dA(size_A);
+    device_vector<T> dx(size_x);
+    device_vector<T> dy_1(size_y);
+    device_vector<T> dy_2(size_y);
+    device_vector<T> d_alpha(1);
+    device_vector<T> d_beta(1);
+    if((!dA && size_A) || (!dx && size_x) || ((!dy_1 || !dy_2) && size_y) || !d_alpha || !d_beta)
+    {
+        CHECK_HIP_ERROR(hipErrorOutOfMemory);
+        return;
+    }
+
+    // Initial Data on CPU
+    rocblas_seedrand();
+    rocblas_init<T>(hA, M, N, lda, stride_a, batch_count);
+    rocblas_init<T>(hx, 1, dim_x, abs_incx, stride_x, batch_count);
+
+    if(rocblas_isnan(arg.beta))
+        rocblas_init_nan<T>(hy_1, 1, dim_y, abs_incy, stride_y, batch_count);
+    else
+        rocblas_init<T>(hy_1, 1, dim_y, abs_incy, stride_y, batch_count);
+
+    // hy_gold (CPU BLAS output) and hy_2 (device pointer mode run) start as copies of hy_1
+    hy_gold = hy_1;
+    hy_2    = hy_1;
+
+    // copy data from CPU to device
+    CHECK_HIP_ERROR(hipMemcpy(dA, hA, sizeof(T) * size_A, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dx, hx, sizeof(T) * size_x, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1, sizeof(T) * size_y, hipMemcpyHostToDevice));
+
+    double gpu_time_used, cpu_time_used;
+    double rocblas_gflops, cblas_gflops, rocblas_bandwidth;
+    double rocblas_error_1, rocblas_error_2;
+
+    /* ============================== ROCBLAS ============================== */
+    if(arg.unit_check || arg.norm_check)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2, sizeof(T) * size_y, hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice));
+
+        CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host));
+        CHECK_ROCBLAS_ERROR(rocblas_gemv_strided_batched<T>(handle,
+                                                            transA,
+                                                            M,
+                                                            N,
+                                                            &h_alpha,
+                                                            dA,
+                                                            lda,
+                                                            stride_a,
+                                                            dx,
+                                                            incx,
+                                                            stride_x,
+                                                            &h_beta,
+                                                            dy_1,
+                                                            incy,
+                                                            stride_y,
+                                                            batch_count));
+
+        CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device));
+        CHECK_ROCBLAS_ERROR(rocblas_gemv_strided_batched<T>(handle,
+                                                            transA,
+                                                            M,
+                                                            N,
+                                                            d_alpha,
+                                                            dA,
+                                                            lda,
+                                                            stride_a,
+                                                            dx,
+                                                            incx,
+                                                            stride_x,
+                                                            d_beta,
+                                                            dy_2,
+                                                            incy,
+                                                            stride_y,
+                                                            batch_count));
+
+        // copy output from device to CPU
+        CHECK_HIP_ERROR(hipMemcpy(hy_1, dy_1, sizeof(T) * size_y, hipMemcpyDeviceToHost));
+        CHECK_HIP_ERROR(hipMemcpy(hy_2, dy_2, sizeof(T) * size_y, hipMemcpyDeviceToHost));
+
+        // CPU BLAS
+        cpu_time_used = get_time_us();
+        for(int b = 0; b < batch_count; ++b)
+        {
+            cblas_gemv<T>(transA,
+                          M,
+                          N,
+                          h_alpha,
+                          hA + b * stride_a,
+                          lda,
+                          hx + b * stride_x,
+                          incx,
+                          h_beta,
+                          hy_gold + b * stride_y,
+                          incy);
+        }
+        cpu_time_used = get_time_us() - cpu_time_used;
+        cblas_gflops  = batch_count * gemv_gflop_count<T>(M, N) / cpu_time_used * 1e6;
+
+        if(arg.unit_check)
+        {
+            unit_check_general<T>(1, dim_y, batch_count, abs_incy, stride_y, hy_gold, hy_1);
+            unit_check_general<T>(1, dim_y, batch_count, abs_incy, stride_y, hy_gold, hy_2);
+        }
+
+        if(arg.norm_check)
+        {
+            rocblas_error_1 = norm_check_general<T>(
+                'F', 1, dim_y, abs_incy, stride_y, batch_count, hy_gold, hy_1);
+            rocblas_error_2 = norm_check_general<T>(
+                'F', 1, dim_y, abs_incy, stride_y, batch_count, hy_gold, hy_2);
+        }
+    }
+
+    if(arg.timing)
+    {
+        int number_cold_calls = 2;
+        int number_hot_calls  = 100;
+        CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host));
+
+        for(int iter = 0; iter < number_cold_calls; iter++)
+        {
+            rocblas_gemv_strided_batched<T>(handle,
+                                            transA,
+                                            M,
+                                            N,
+                                            &h_alpha,
+                                            dA,
+                                            lda,
+                                            stride_a,
+                                            dx,
+                                            incx,
+                                            stride_x,
+                                            &h_beta,
+                                            dy_1,
+                                            incy,
+                                            stride_y,
+                                            batch_count);
+        }
+
+        gpu_time_used = get_time_us(); // in microseconds
+
+        for(int iter = 0; iter < number_hot_calls; iter++)
+        {
+            rocblas_gemv_strided_batched<T>(handle,
+                                            transA,
+                                            M,
+                                            N,
+                                            &h_alpha,
+                                            dA,
+                                            lda,
+                                            stride_a,
+                                            dx,
+                                            incx,
+                                            stride_x,
+                                            &h_beta,
+                                            dy_1,
+                                            incy,
+                                            stride_y,
+                                            batch_count);
+        }
+
+        gpu_time_used     = (get_time_us() - gpu_time_used) / number_hot_calls;
+        rocblas_gflops    = batch_count * gemv_gflop_count<T>(M, N) / gpu_time_used * 1e6;
+        rocblas_bandwidth = batch_count * (1.0 * M * N) * sizeof(T) / gpu_time_used / 1e3;
+
+        // only norm_check returns a norm error; unit_check does not return anything
+        std::cout << "M,N,alpha,lda,stride_a,incx,stride_x,beta,incy,stride_y,batch_count,rocblas-"
+                     "Gflops,rocblas-GB/s,";
+        if(arg.norm_check)
+        {
+            std::cout << "CPU-Gflops,norm_error_host_ptr,norm_error_device_ptr";
+        }
+        std::cout << std::endl;
+
+        std::cout << M << "," << N << "," << h_alpha << "," << lda << "," << stride_a << "," << incx
+                  << "," << stride_x << "," << h_beta << "," << incy << "," << stride_y << ","
+                  << batch_count << "," << rocblas_gflops << "," << rocblas_bandwidth << ",";
+
+        if(arg.norm_check)
+        {
+            std::cout << cblas_gflops << ',';
+            std::cout << rocblas_error_1 << ',' << rocblas_error_2;
+        }
+
+        std::cout << std::endl;
+    }
+}
diff --git a/clients/include/unit.hpp b/clients/include/unit.hpp
index 7bc8fe0d8..8d5d52de2 100644
--- a/clients/include/unit.hpp
+++ b/clients/include/unit.hpp
@@ -14,9 +14,11 @@
 #include "rocblas.h"
 #include "rocblas_math.hpp"
 #include "rocblas_test.hpp"
+#include "rocblas_vector.hpp"
 
 #ifndef GOOGLE_TEST
 #define UNIT_CHECK(M, N, batch_count, lda, strideA, hCPU, hGPU, UNIT_ASSERT_EQ)
+#define UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, UNIT_ASSERT_EQ)
 #else
 // clang-format off
 #define UNIT_CHECK(M, N, batch_count, lda, strideA, hCPU, hGPU, UNIT_ASSERT_EQ)      \
@@ -32,6 +34,19 @@
                                        hGPU[i + j * lda + k * strideA]);             \
                     }                                                                \
     } while(0)
+#define UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, UNIT_ASSERT_EQ)             \
+    do /* batch k is hCPU[k] / hGPU[k]; element (i, j) is at [i + j * lda] */        \
+    {                                                                                \
+        for(size_t k = 0; k < batch_count; k++)                                      \
+            for(size_t j = 0; j < N; j++)                                            \
+                for(size_t i = 0; i < M; i++)                                        \
+                    if (rocblas_isnan(hCPU[k][i + j * lda])) { /* expect NaN too */  \
+                        ASSERT_TRUE(rocblas_isnan(hGPU[k][i + j * lda]));            \
+                    } else {                                                         \
+                        UNIT_ASSERT_EQ(hCPU[k][i + j * lda],                         \
+                                       hGPU[k][i + j * lda]);                        \
+                    }                                                                \
+    } while(0)
 // clang-format on
 #endif
 
@@ -206,6 +221,80 @@ inline void unit_check_general(rocblas_int  M,
     UNIT_CHECK(M, N, batch_count, lda, strideA, hCPU, hGPU, ASSERT_EQ);
 }
 
+template <typename T> // batched overload: one host_vector per batch instance, no stride
+void unit_check_general(rocblas_int    M, // rows compared in each batch instance
+                        rocblas_int    N, // columns compared in each batch instance
+                        rocblas_int    batch_count, // number of array entries compared
+                        rocblas_int    lda, // leading dimension of each matrix
+                        host_vector<T> hCPU[], // reference values (presumably the CPU results)
+                        host_vector<T> hGPU[]); // values under test (presumably the GPU results)
+
+template <> // batched check for rocblas_bfloat16, compared with ASSERT_BFLOAT16_EQ
+inline void unit_check_general(rocblas_int        M,
+                               rocblas_int        N,
+                               rocblas_int        batch_count,
+                               rocblas_int        lda,
+                               host_vector<rocblas_bfloat16> hCPU[],
+                               host_vector<rocblas_bfloat16> hGPU[])
+{
+    UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, ASSERT_BFLOAT16_EQ);
+}
+
+template <> // batched check for rocblas_half, compared with ASSERT_HALF_EQ
+inline void unit_check_general(rocblas_int        M,
+                               rocblas_int        N,
+                               rocblas_int        batch_count,
+                               rocblas_int        lda,
+                               host_vector<rocblas_half> hCPU[],
+                               host_vector<rocblas_half> hGPU[])
+{
+    UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, ASSERT_HALF_EQ);
+}
+
+template <> // batched check for float, compared with ASSERT_FLOAT_EQ
+inline void unit_check_general(rocblas_int        M,
+                               rocblas_int        N,
+                               rocblas_int        batch_count,
+                               rocblas_int        lda,
+                               host_vector<float> hCPU[],
+                               host_vector<float> hGPU[])
+{
+    UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, ASSERT_FLOAT_EQ);
+}
+
+template <> // batched check for double, compared with ASSERT_DOUBLE_EQ
+inline void unit_check_general(rocblas_int         M,
+                               rocblas_int         N,
+                               rocblas_int         batch_count,
+                               rocblas_int         lda,
+                               host_vector<double> hCPU[],
+                               host_vector<double> hGPU[])
+{
+    UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, ASSERT_DOUBLE_EQ);
+}
+
+template <> // batched check for rocblas_float_complex, compared with ASSERT_FLOAT_COMPLEX_EQ
+inline void unit_check_general(rocblas_int         M,
+                               rocblas_int         N,
+                               rocblas_int         batch_count,
+                               rocblas_int         lda,
+                               host_vector<rocblas_float_complex> hCPU[],
+                               host_vector<rocblas_float_complex> hGPU[])
+{
+    UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, ASSERT_FLOAT_COMPLEX_EQ);
+}
+
+template <> // batched check for rocblas_double_complex, compared with ASSERT_DOUBLE_COMPLEX_EQ
+inline void unit_check_general(rocblas_int         M,
+                               rocblas_int         N,
+                               rocblas_int         batch_count,
+                               rocblas_int         lda,
+                               host_vector<rocblas_double_complex> hCPU[],
+                               host_vector<rocblas_double_complex> hGPU[])
+{
+    UNIT_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, ASSERT_DOUBLE_COMPLEX_EQ);
+}
+
 template <typename T>
 inline void trsm_err_res_check(T max_error, rocblas_int M, T forward_tolerance, T eps)
 {
diff --git a/library/src/blas2/gemv_device.hpp b/library/src/blas2/gemv_device.hpp
index 6e106c8c7..d2b973e6f 100644
--- a/library/src/blas2/gemv_device.hpp
+++ b/library/src/blas2/gemv_device.hpp
@@ -190,7 +190,6 @@ __device__ void gemvn_kernel(rocblas_int                   m,
     rocblas_int ty = thread_id / DIM_X;
 
     rocblas_int ind = hipBlockIdx_x * DIM_X + tx;
-    ;
 
     __shared__ rocblas_double_complex sdata[DIM_X * DIM_Y];
 
diff --git a/library/src/blas2/rocblas_gemv.cpp b/library/src/blas2/rocblas_gemv.cpp
index 17c1afb01..25aeaf9e4 100644
--- a/library/src/blas2/rocblas_gemv.cpp
+++ b/library/src/blas2/rocblas_gemv.cpp
@@ -2,7 +2,6 @@
  * Copyright 2016-2019 Advanced Micro Devices, Inc.
  * ************************************************************************ */
 #include "rocblas_gemv.hpp"
-#include "gemv_device.hpp"
 #include "handle.h"
 #include "logging.h"
 #include "rocblas.h"
diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp
index 2d72e1fa5..f02dda7d6 100644
--- a/library/src/blas2/rocblas_gemv.hpp
+++ b/library/src/blas2/rocblas_gemv.hpp
@@ -21,6 +21,7 @@ rocblas_status rocblas_gemv_template(rocblas_handle    handle,
                                      T*                y,
                                      rocblas_int       incy)
 {
+    //quick return
     if(!m || !n)
         return rocblas_status_success;
 
@@ -194,4 +195,368 @@ rocblas_status rocblas_gemv_template(rocblas_handle    handle,
     return rocblas_status_success;
 }
 
+// Dispatches the batched GEMV kernel selected by transA; grid.y = batch_count.
+// Launches nothing when m, n or batch_count is zero, or when pointer mode is not
+// device and alpha == 0, beta == 1. Validates no arguments; always returns success.
+template <typename T>
+rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
+                                        rocblas_operation transA,
+                                        rocblas_int       m,
+                                        rocblas_int       n,
+                                        const T*          alpha,
+                                        const T* const    A[],
+                                        rocblas_int       lda,
+                                        const T* const    x[],
+                                        rocblas_int       incx,
+                                        const T*          beta,
+                                        T* const          y[],
+                                        rocblas_int       incy,
+                                        rocblas_int       batch_count)
+{
+    // Quick return if possible: an empty problem is not an argument error
+    if(!m || !n || !batch_count)
+        return rocblas_status_success;
+
+    hipStream_t rocblas_stream = handle->rocblas_stream;
+
+    if(transA == rocblas_operation_none)
+    {
+        // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s
+        static constexpr int GEMVN_DIM_X = 64;
+        static constexpr int GEMVN_DIM_Y = 16;
+        rocblas_int          blocks      = (m - 1) / (GEMVN_DIM_X * 4) + 1;
+
+        dim3 gemvn_grid(blocks, batch_count);
+        dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
+
+        if(handle->pointer_mode == rocblas_pointer_mode_device) // alpha, beta forwarded as pointers
+        {
+            hipLaunchKernelGGL((gemvn_kernel_batched<GEMVN_DIM_X, GEMVN_DIM_Y>),
+                                   gemvn_grid,
+                                   gemvn_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   alpha,
+                                   A,
+                                   lda,
+                                   x,
+                                   incx,
+                                   beta,
+                                   y,
+                                   incy);
+        }
+        else // alpha, beta dereferenced here and passed by value
+        {
+            if(!*alpha && *beta == 1) // alpha == 0 and beta == 1: skip the launch
+                return rocblas_status_success;
+
+            hipLaunchKernelGGL((gemvn_kernel_batched<GEMVN_DIM_X, GEMVN_DIM_Y>),
+                                   gemvn_grid,
+                                   gemvn_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   *alpha,
+                                   A,
+                                   lda,
+                                   x,
+                                   incx,
+                                   *beta,
+                                   y,
+                                   incy);
+        }
+    }
+    else if(transA == rocblas_operation_transpose)
+    {
+        // number of columns on the x-dim of the grid, batch_count on the y-dim
+        static constexpr int NB = 256;
+        dim3                 gemvt_grid(n, batch_count);
+        dim3                 gemvt_threads(NB);
+
+        if(handle->pointer_mode == rocblas_pointer_mode_device) // alpha, beta forwarded as pointers
+        {
+            hipLaunchKernelGGL(gemvt_kernel_batched<NB>,
+                                   gemvt_grid,
+                                   gemvt_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   alpha,
+                                   A,
+                                   lda,
+                                   x,
+                                   incx,
+                                   beta,
+                                   y,
+                                   incy);
+        }
+        else // alpha, beta dereferenced here and passed by value
+        {
+            if(!*alpha && *beta == 1) // alpha == 0 and beta == 1: skip the launch
+                return rocblas_status_success;
+
+            hipLaunchKernelGGL(gemvt_kernel_batched<NB>,
+                                   gemvt_grid,
+                                   gemvt_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   *alpha,
+                                   A,
+                                   lda,
+                                   x,
+                                   incx,
+                                   *beta,
+                                   y,
+                                   incy);
+        }
+    }
+    else // conjugate transpose
+    {
+        // same grid shape as transpose: n on the x-dim, batch_count on the y-dim
+        static constexpr int NB = 256;
+        dim3                 gemvc_grid(n, batch_count);
+        dim3                 gemvc_threads(NB);
+
+        if(handle->pointer_mode == rocblas_pointer_mode_device) // alpha, beta forwarded as pointers
+        {
+            hipLaunchKernelGGL(gemvc_kernel_batched<NB>,
+                                   gemvc_grid,
+                                   gemvc_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   alpha,
+                                   A,
+                                   lda,
+                                   x,
+                                   incx,
+                                   beta,
+                                   y,
+                                   incy);
+        }
+        else // alpha, beta dereferenced here and passed by value
+        {
+            if(!*alpha && *beta == 1) // alpha == 0 and beta == 1: skip the launch
+                return rocblas_status_success;
+
+            hipLaunchKernelGGL(gemvc_kernel_batched<NB>,
+                                   gemvc_grid,
+                                   gemvc_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   *alpha,
+                                   A,
+                                   lda,
+                                   x,
+                                   incx,
+                                   *beta,
+                                   y,
+                                   incy);
+        }
+    }
+    return rocblas_status_success;
+}
+
+
+// Dispatches the strided-batched GEMV kernel selected by transA; grid.y = batch_count.
+// Launches nothing when m, n or batch_count is zero, or when pointer mode is not
+// device and alpha == 0, beta == 1. Validates no arguments; always returns success.
+template <typename T>
+rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
+                                                rocblas_operation transA,
+                                                rocblas_int       m,
+                                                rocblas_int       n,
+                                                const T*          alpha,
+                                                const T*          A,
+                                                rocblas_int       lda,
+                                                rocblas_int       strideA,
+                                                const T*          x,
+                                                rocblas_int       incx,
+                                                rocblas_int       stridex,
+                                                const T*          beta,
+                                                T*                y,
+                                                rocblas_int       incy,
+                                                rocblas_int       stridey,
+                                                rocblas_int       batch_count)
+{
+    // Quick return if possible: an empty problem is not an argument error
+    if(!m || !n || !batch_count)
+        return rocblas_status_success;
+
+    hipStream_t rocblas_stream = handle->rocblas_stream;
+
+    if(transA == rocblas_operation_none)
+    {
+        // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s
+        static constexpr int GEMVN_DIM_X = 64;
+        static constexpr int GEMVN_DIM_Y = 16;
+        rocblas_int          blocks      = (m - 1) / (GEMVN_DIM_X * 4) + 1;
+
+        dim3 gemvn_grid(blocks, batch_count);
+        dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
+
+        if(handle->pointer_mode == rocblas_pointer_mode_device) // alpha, beta forwarded as pointers
+        {
+            hipLaunchKernelGGL((gemvn_kernel_strided<GEMVN_DIM_X, GEMVN_DIM_Y>),
+                                   gemvn_grid,
+                                   gemvn_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   alpha,
+                                   A,
+                                   lda,
+                                   strideA,
+                                   x,
+                                   incx,
+                                   stridex,
+                                   beta,
+                                   y,
+                                   incy,
+                                   stridey);
+        }
+        else // alpha, beta dereferenced here and passed by value
+        {
+            if(!*alpha && *beta == 1) // alpha == 0 and beta == 1: skip the launch
+                return rocblas_status_success;
+
+            hipLaunchKernelGGL((gemvn_kernel_strided<GEMVN_DIM_X, GEMVN_DIM_Y>),
+                                   gemvn_grid,
+                                   gemvn_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   *alpha,
+                                   A,
+                                   lda,
+                                   strideA,
+                                   x,
+                                   incx,
+                                   stridex,
+                                   *beta,
+                                   y,
+                                   incy,
+                                   stridey);
+        }
+    }
+    else if(transA == rocblas_operation_transpose)
+    {
+        // number of columns on the x-dim of the grid, batch_count on the y-dim
+        static constexpr int NB = 256;
+        dim3                 gemvt_grid(n, batch_count);
+        dim3                 gemvt_threads(NB);
+
+        if(handle->pointer_mode == rocblas_pointer_mode_device) // alpha, beta forwarded as pointers
+        {
+            hipLaunchKernelGGL(gemvt_kernel_strided<NB>,
+                                   gemvt_grid,
+                                   gemvt_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   alpha,
+                                   A,
+                                   lda,
+                                   strideA,
+                                   x,
+                                   incx,
+                                   stridex,
+                                   beta,
+                                   y,
+                                   incy,
+                                   stridey);
+        }
+        else // alpha, beta dereferenced here and passed by value
+        {
+            if(!*alpha && *beta == 1) // alpha == 0 and beta == 1: skip the launch
+                return rocblas_status_success;
+
+            hipLaunchKernelGGL(gemvt_kernel_strided<NB>,
+                                   gemvt_grid,
+                                   gemvt_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   *alpha,
+                                   A,
+                                   lda,
+                                   strideA,
+                                   x,
+                                   incx,
+                                   stridex,
+                                   *beta,
+                                   y,
+                                   incy,
+                                   stridey);
+        }
+    }
+    else // conjugate transpose
+    {
+        // same grid shape as transpose: n on the x-dim, batch_count on the y-dim
+        static constexpr int NB = 256;
+        dim3                 gemvc_grid(n, batch_count);
+        dim3                 gemvc_threads(NB);
+
+        if(handle->pointer_mode == rocblas_pointer_mode_device) // alpha, beta forwarded as pointers
+        {
+            hipLaunchKernelGGL(gemvc_kernel_strided<NB>,
+                                   gemvc_grid,
+                                   gemvc_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   alpha,
+                                   A,
+                                   lda,
+                                   strideA,
+                                   x,
+                                   incx,
+                                   stridex,
+                                   beta,
+                                   y,
+                                   incy,
+                                   stridey);
+        }
+        else // alpha, beta dereferenced here and passed by value
+        {
+            if(!*alpha && *beta == 1) // alpha == 0 and beta == 1: skip the launch
+                return rocblas_status_success;
+
+            hipLaunchKernelGGL(gemvc_kernel_strided<NB>,
+                                   gemvc_grid,
+                                   gemvc_threads,
+                                   0,
+                                   rocblas_stream,
+                                   m,
+                                   n,
+                                   *alpha,
+                                   A,
+                                   lda,
+                                   strideA,
+                                   x,
+                                   incx,
+                                   stridex,
+                                   *beta,
+                                   y,
+                                   incy,
+                                   stridey);
+        }
+    }
+    return rocblas_status_success;
+}
+
 #endif
diff --git a/library/src/blas2/rocblas_gemv_batched.cpp b/library/src/blas2/rocblas_gemv_batched.cpp
index 2ec2fc3c6..7c7005dac 100644
--- a/library/src/blas2/rocblas_gemv_batched.cpp
+++ b/library/src/blas2/rocblas_gemv_batched.cpp
@@ -1,7 +1,7 @@
 /* ************************************************************************
  * Copyright 2016-2019 Advanced Micro Devices, Inc.
  * ************************************************************************ */
-#include "gemv_device.hpp"
+#include "rocblas_gemv.hpp"
 #include "handle.h"
 #include "logging.h"
 #include "rocblas.h"
@@ -21,7 +21,7 @@ namespace
     constexpr char rocblas_gemv_name<rocblas_double_complex>[] = "rocblas_zgemv_batched";
 
     template <typename T>
-    rocblas_status rocblas_gemv_batched(rocblas_handle    handle,
+    rocblas_status rocblas_gemv_batched_impl(rocblas_handle    handle,
                                         rocblas_operation transA,
                                         rocblas_int       m,
                                         rocblas_int       n,
@@ -90,7 +90,7 @@ namespace
                               *beta,
                               "--incy",
                               incy,
-                              "--batch_count",
+                              "--batch",
                               batch_count);
             }
             else
@@ -127,7 +127,7 @@ namespace
                             incx,
                             "incy",
                             incy,
-                            "batch_count",
+                            "batch",
                             batch_count);
         }
 
@@ -135,163 +135,14 @@ namespace
             return rocblas_status_invalid_pointer;
         if(m < 0 || n < 0 || lda < m || lda < 1 || !incx || !incy)
             return rocblas_status_invalid_size;
-        // Quick return if possible. Not Argument error
-        if(!m || !n)
-            return rocblas_status_success;
-
-        hipStream_t rocblas_stream = handle->rocblas_stream;
-
-        if(transA == rocblas_operation_none)
-        {
-            // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s
-            static constexpr int GEMVN_DIM_X = 64;
-            static constexpr int GEMVN_DIM_Y = 16;
-            rocblas_int          blocks      = (m - 1) / (GEMVN_DIM_X * 4) + 1;
-
-            dim3 gemvn_grid(blocks, batch_count);
-            dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
-
-            if(handle->pointer_mode == rocblas_pointer_mode_device)
-            {
-                hipLaunchKernelGGL((gemvn_kernel_batched<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   beta,
-                                   y,
-                                   incy);
-            }
-            else
-            {
-                if(!*alpha && *beta == 1)
-                    return rocblas_status_success;
-
-                hipLaunchKernelGGL((gemvn_kernel_batched<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   *beta,
-                                   y,
-                                   incy);
-            }
-        }
-        else if(transA == rocblas_operation_transpose)
-        {
-            // transpose
-            // number of columns on the y-dim of the grid
-            static constexpr int NB = 256;
-            dim3                 gemvt_grid(n, batch_count);
-            dim3                 gemvt_threads(NB);
-
-            if(handle->pointer_mode == rocblas_pointer_mode_device)
-            {
-                hipLaunchKernelGGL(gemvt_kernel_batched<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   beta,
-                                   y,
-                                   incy);
-            }
-            else
-            {
-                if(!*alpha && *beta == 1)
-                    return rocblas_status_success;
-
-                hipLaunchKernelGGL(gemvt_kernel_batched<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   *beta,
-                                   y,
-                                   incy);
-            }
-        }
-        else // conjugate transpose
-        {
-            // conjugate transpose
-            // number of columns on the y-dim of the grid
-            static constexpr int NB = 256;
-            dim3                 gemvc_grid(n, 1);
-            dim3                 gemvc_threads(NB);
-
-            if(handle->pointer_mode == rocblas_pointer_mode_device)
-            {
-                hipLaunchKernelGGL(gemvc_kernel_batched<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   beta,
-                                   y,
-                                   incy);
-            }
-            else
-            {
-                if(!*alpha && *beta == 1)
-                    return rocblas_status_success;
+        if(batch_count < 0)
+            return rocblas_status_invalid_size;
 
-                hipLaunchKernelGGL(gemvc_kernel_batched<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   *beta,
-                                   y,
-                                   incy);
-            }
-        }
-        return rocblas_status_success;
+        return rocblas_gemv_batched_template(handle,transA,m,n,alpha,A,lda,x,incx,beta,y,incy,batch_count);
     }
-
 } // namespace
 
+
 /*
  * ===========================================================================
  *    C wrapper
@@ -314,7 +165,7 @@ rocblas_status rocblas_sgemv_batched(rocblas_handle     handle,
                                      rocblas_int        incy,
                                      rocblas_int        batch_count)
 {
-    return rocblas_gemv_batched(
+    return rocblas_gemv_batched_impl(
         handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
 }
 
@@ -332,7 +183,7 @@ rocblas_status rocblas_dgemv_batched(rocblas_handle      handle,
                                      rocblas_int         incy,
                                      rocblas_int         batch_count)
 {
-    return rocblas_gemv_batched(
+    return rocblas_gemv_batched_impl(
         handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
 }
 
@@ -350,7 +201,7 @@ rocblas_status rocblas_cgemv_batched(rocblas_handle                     handle,
                                      rocblas_int                        incy,
                                      rocblas_int                        batch_count)
 {
-    return rocblas_gemv_batched(
+    return rocblas_gemv_batched_impl(
         handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
 }
 
@@ -368,7 +219,7 @@ rocblas_status rocblas_zgemv_batched(rocblas_handle                      handle,
                                      rocblas_int                         incy,
                                      rocblas_int                         batch_count)
 {
-    return rocblas_gemv_batched(
+    return rocblas_gemv_batched_impl(
         handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
 }
 
diff --git a/library/src/blas2/rocblas_gemv_strided_batched.cpp b/library/src/blas2/rocblas_gemv_strided_batched.cpp
index da3d4716c..6238b5ea0 100644
--- a/library/src/blas2/rocblas_gemv_strided_batched.cpp
+++ b/library/src/blas2/rocblas_gemv_strided_batched.cpp
@@ -1,7 +1,7 @@
 /* ************************************************************************
  * Copyright 2016-2019 Advanced Micro Devices, Inc.
  * ************************************************************************ */
-#include "gemv_device.hpp"
+#include "rocblas_gemv.hpp"
 #include "handle.h"
 #include "logging.h"
 #include "rocblas.h"
@@ -21,7 +21,7 @@ namespace
     constexpr char rocblas_gemv_name<rocblas_double_complex>[] = "rocblas_zgemv_strided_batched";
 
     template <typename T>
-    rocblas_status rocblas_gemv_strided_batched(rocblas_handle    handle,
+    rocblas_status rocblas_gemv_strided_batched_impl(rocblas_handle    handle,
                                                 rocblas_operation transA,
                                                 rocblas_int       m,
                                                 rocblas_int       n,
@@ -90,19 +90,19 @@ namespace
                                   : "",
                               "--lda",
                               lda,
-                              "--strideA",
+                              "--stride_a",
                               strideA,
                               "--incx",
                               incx,
-                              "--stridex",
+                              "--stride_x",
                               stridex,
                               "--beta",
                               *beta,
                               "--incy",
                               incy,
-                              "--stridey",
+                              "--stride_y",
                               stridey,
-                              "--batch_count",
+                              "--batch",
                               batch_count);
             }
             else
@@ -138,17 +138,17 @@ namespace
                             n,
                             "lda",
                             lda,
-                            "strideA",
+                            "stride_a",
                             strideA,
                             "incx",
                             incx,
-                            "stridex",
+                            "stride_x",
                             stridex,
                             "incy",
                             incy,
-                            "stridey",
+                            "stride_y",
                             stridey,
-                            "batch_count",
+                            "batch",
                             batch_count);
         }
 
@@ -156,181 +156,45 @@ namespace
             return rocblas_status_invalid_pointer;
         if(m < 0 || n < 0 || lda < m || lda < 1 || !incx || !incy)
             return rocblas_status_invalid_size;
+        if(strideA < lda * n)
+            return rocblas_status_invalid_size;
+        if(batch_count < 0)
+            return rocblas_status_invalid_size;
 
-        // Quick return if possible. Not Argument error
-        if(!m || !n)
-            return rocblas_status_success;
-
-        hipStream_t rocblas_stream = handle->rocblas_stream;
+        size_t size_x, dim_x, abs_incx;
+        size_t size_y, dim_y, abs_incy;
 
         if(transA == rocblas_operation_none)
         {
-            // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s
-            static constexpr int GEMVN_DIM_X = 64;
-            static constexpr int GEMVN_DIM_Y = 16;
-            rocblas_int          blocks      = (m - 1) / (GEMVN_DIM_X * 4) + 1;
-
-            dim3 gemvn_grid(blocks, batch_count);
-            dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
-
-            if(handle->pointer_mode == rocblas_pointer_mode_device)
-            {
-                hipLaunchKernelGGL((gemvn_kernel_strided<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   beta,
-                                   y,
-                                   incy,
-                                   stridey);
-            }
-            else
-            {
-                if(!*alpha && *beta == 1)
-                    return rocblas_status_success;
-
-                hipLaunchKernelGGL((gemvn_kernel_strided<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   *beta,
-                                   y,
-                                   incy,
-                                   stridey);
-            }
+            dim_x = n;
+            dim_y = m;
         }
-        else if(transA == rocblas_operation_transpose)
+        else
         {
-            // transpose
-            // number of columns on the y-dim of the grid
-            static constexpr int NB = 256;
-            dim3                 gemvt_grid(n, batch_count);
-            dim3                 gemvt_threads(NB);
+            dim_x = m;
+            dim_y = n;
+        }
 
-            if(handle->pointer_mode == rocblas_pointer_mode_device)
-            {
-                hipLaunchKernelGGL(gemvt_kernel_strided<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   beta,
-                                   y,
-                                   incy,
-                                   stridey);
-            }
-            else
-            {
-                if(!*alpha && *beta == 1)
-                    return rocblas_status_success;
+        abs_incx = incx >= 0 ? incx : -incx;
+        abs_incy = incy >= 0 ? incy : -incy;
 
-                hipLaunchKernelGGL(gemvt_kernel_strided<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   *beta,
-                                   y,
-                                   incy,
-                                   stridey);
-            }
-        }
-        else // conjugate transpose
-        {
-            // conjugate transpose
-            // number of columns on the y-dim of the grid
-            static constexpr int NB = 256;
-            dim3                 gemvc_grid(n, 1);
-            dim3                 gemvc_threads(NB);
+        size_x = dim_x * abs_incx;
+        size_y = dim_y * abs_incy;
 
-            if(handle->pointer_mode == rocblas_pointer_mode_device)
-            {
-                hipLaunchKernelGGL(gemvc_kernel_strided<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   beta,
-                                   y,
-                                   incy,
-                                   stridey);
-            }
-            else
-            {
-                if(!*alpha && *beta == 1)
-                    return rocblas_status_success;
+        if(stridex < size_x || stridey < size_y)
+            return rocblas_status_invalid_size;
 
-                hipLaunchKernelGGL(gemvc_kernel_strided<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   *beta,
-                                   y,
-                                   incy,
-                                   stridey);
-            }
-        }
-        return rocblas_status_success;
+        return rocblas_gemv_strided_batched_template(handle,
+                                                    transA,
+                                                    m, n, alpha,
+                                                    A, lda, strideA,
+                                                    x, incx, stridex,
+                                                    beta,
+                                                    y, incy, stridey,
+                                                    batch_count);
     }
+} //namespace
 
-} // namespace
 
 /*
 * ===========================================================================
@@ -357,7 +221,7 @@ rocblas_status rocblas_sgemv_strided_batched(rocblas_handle    handle,
                                              rocblas_int       stridey,
                                              rocblas_int       batch_count)
 {
-    return rocblas_gemv_strided_batched(handle,
+    return rocblas_gemv_strided_batched_impl(handle,
                                         transA,
                                         m,
                                         n,
@@ -392,7 +256,7 @@ rocblas_status rocblas_dgemv_strided_batched(rocblas_handle    handle,
                                              rocblas_int       stridey,
                                              rocblas_int       batch_count)
 {
-    return rocblas_gemv_strided_batched(handle,
+    return rocblas_gemv_strided_batched_impl(handle,
                                         transA,
                                         m,
                                         n,
@@ -427,7 +291,7 @@ rocblas_status rocblas_cgemv_strided_batched(rocblas_handle               handle
                                              rocblas_int                  stridey,
                                              rocblas_int                  batch_count)
 {
-    return rocblas_gemv_strided_batched(handle,
+    return rocblas_gemv_strided_batched_impl(handle,
                                         transA,
                                         m,
                                         n,
@@ -462,7 +326,7 @@ rocblas_status rocblas_zgemv_strided_batched(rocblas_handle                handl
                                              rocblas_int                   stridey,
                                              rocblas_int                   batch_count)
 {
-    return rocblas_gemv_strided_batched(handle,
+    return rocblas_gemv_strided_batched_impl(handle,
                                         transA,
                                         m,
                                         n,

From e802ddee4a891931fc5a15ad734b31c913478cf3 Mon Sep 17 00:00:00 2001
From: jzuniga-amd <juan.zuniga-anaya@amd.com>
Date: Fri, 9 Aug 2019 12:34:15 -0600
Subject: [PATCH 2/5] Clang formatting

---
 clients/common/norm.cpp                       |  64 +--
 clients/include/norm.hpp                      |   1 -
 clients/include/rocblas.hpp                   |   6 +-
 clients/include/rocblas_vector.hpp            |  74 +--
 clients/include/testing_gemv.hpp              |   4 +-
 clients/include/testing_gemv_batched.hpp      |  31 +-
 clients/include/unit.hpp                      |  32 +-
 library/src/blas2/rocblas_gemv.hpp            | 428 +++++++++---------
 library/src/blas2/rocblas_gemv_batched.cpp    |  30 +-
 .../blas2/rocblas_gemv_strided_batched.cpp    | 175 +++----
 10 files changed, 432 insertions(+), 413 deletions(-)

diff --git a/clients/common/norm.cpp b/clients/common/norm.cpp
index e7760d42a..cdcc5fb6a 100644
--- a/clients/common/norm.cpp
+++ b/clients/common/norm.cpp
@@ -506,14 +506,14 @@ double norm_check_general<double>(char        norm_type,
 }
 
 template <>
-double norm_check_general<rocblas_float_complex>(char        norm_type,
-                                  rocblas_int M,
-                                  rocblas_int N,
-                                  rocblas_int lda,
-                                  rocblas_int stride_a,
-                                  rocblas_int batch_count,
-                                  rocblas_float_complex*     hCPU,
-                                  rocblas_float_complex*     hGPU)
+double norm_check_general<rocblas_float_complex>(char                   norm_type,
+                                                 rocblas_int            M,
+                                                 rocblas_int            N,
+                                                 rocblas_int            lda,
+                                                 rocblas_int            stride_a,
+                                                 rocblas_int            batch_count,
+                                                 rocblas_float_complex* hCPU,
+                                                 rocblas_float_complex* hGPU)
 {
     // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
     // one norm is max column sum
@@ -523,9 +523,9 @@ double norm_check_general<rocblas_float_complex>(char        norm_type,
     // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
     // of strided batched matrix
 
-    float      work;
+    float       work;
     rocblas_int incx  = 1;
-    float      alpha = -1.0f;
+    float       alpha = -1.0f;
     rocblas_int size  = lda * N;
 
     double cumulative_error = 0.0;
@@ -552,14 +552,14 @@ double norm_check_general<rocblas_float_complex>(char        norm_type,
 }
 
 template <>
-double norm_check_general<rocblas_double_complex>(char        norm_type,
-                                  rocblas_int M,
-                                  rocblas_int N,
-                                  rocblas_int lda,
-                                  rocblas_int stride_a,
-                                  rocblas_int batch_count,
-                                  rocblas_double_complex*     hCPU,
-                                  rocblas_double_complex*     hGPU)
+double norm_check_general<rocblas_double_complex>(char                    norm_type,
+                                                  rocblas_int             M,
+                                                  rocblas_int             N,
+                                                  rocblas_int             lda,
+                                                  rocblas_int             stride_a,
+                                                  rocblas_int             batch_count,
+                                                  rocblas_double_complex* hCPU,
+                                                  rocblas_double_complex* hGPU)
 {
     // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
     // one norm is max column sum
@@ -689,13 +689,13 @@ double norm_check_general<double>(char                norm_type,
 }
 
 template <>
-double norm_check_general<rocblas_float_complex>(char               norm_type,
-                                 rocblas_int        M,
-                                 rocblas_int        N,
-                                 rocblas_int        lda,
-                                 rocblas_int        batch_count,
-                                 host_vector<rocblas_float_complex> hCPU[],
-                                 host_vector<rocblas_float_complex> hGPU[])
+double norm_check_general<rocblas_float_complex>(char                               norm_type,
+                                                 rocblas_int                        M,
+                                                 rocblas_int                        N,
+                                                 rocblas_int                        lda,
+                                                 rocblas_int                        batch_count,
+                                                 host_vector<rocblas_float_complex> hCPU[],
+                                                 host_vector<rocblas_float_complex> hGPU[])
 {
     // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
     // one norm is max column sum
@@ -734,13 +734,13 @@ double norm_check_general<rocblas_float_complex>(char               norm_type,
 }
 
 template <>
-double norm_check_general<rocblas_double_complex>(char                norm_type,
-                                  rocblas_int         M,
-                                  rocblas_int         N,
-                                  rocblas_int         lda,
-                                  rocblas_int         batch_count,
-                                  host_vector<rocblas_double_complex> hCPU[],
-                                  host_vector<rocblas_double_complex> hGPU[])
+double norm_check_general<rocblas_double_complex>(char                                norm_type,
+                                                  rocblas_int                         M,
+                                                  rocblas_int                         N,
+                                                  rocblas_int                         lda,
+                                                  rocblas_int                         batch_count,
+                                                  host_vector<rocblas_double_complex> hCPU[],
+                                                  host_vector<rocblas_double_complex> hGPU[])
 {
     // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
     // one norm is max column sum
diff --git a/clients/include/norm.hpp b/clients/include/norm.hpp
index 7a38e678d..d81f8c4c8 100644
--- a/clients/include/norm.hpp
+++ b/clients/include/norm.hpp
@@ -50,7 +50,6 @@ double norm_check_general(char           norm_type,
                           host_vector<T> hCPU[],
                           host_vector<T> hGPU[]);
 
-
 /*! \brief  Template: norm check for hermitian/symmetric Matrix: half/float/double/complex */
 template <typename T>
 double norm_check_symmetric(
diff --git a/clients/include/rocblas.hpp b/clients/include/rocblas.hpp
index 662ced891..1f4a890b1 100644
--- a/clients/include/rocblas.hpp
+++ b/clients/include/rocblas.hpp
@@ -357,10 +357,12 @@ template <>
 static constexpr auto rocblas_gemv_strided_batched<double> = rocblas_dgemv_strided_batched;
 
 template <>
-static constexpr auto rocblas_gemv_strided_batched<rocblas_float_complex> = rocblas_cgemv_strided_batched;
+static constexpr auto
+    rocblas_gemv_strided_batched<rocblas_float_complex> = rocblas_cgemv_strided_batched;
 
 template <>
-static constexpr auto rocblas_gemv_strided_batched<rocblas_double_complex> = rocblas_zgemv_strided_batched;
+static constexpr auto
+    rocblas_gemv_strided_batched<rocblas_double_complex> = rocblas_zgemv_strided_batched;
 
 // gemv_batched
 template <typename T>
diff --git a/clients/include/rocblas_vector.hpp b/clients/include/rocblas_vector.hpp
index c4efabb38..ed9e89309 100644
--- a/clients/include/rocblas_vector.hpp
+++ b/clients/include/rocblas_vector.hpp
@@ -15,23 +15,31 @@
 
 /* ============================================================================================ */
 /*! \brief  base-class to allocate/deallocate device memory */
-template<typename T, size_t PAD, typename U>
+template <typename T, size_t PAD, typename U>
 class d_vector
 {
 protected:
     size_t size, bytes;
 
-    #ifdef GOOGLE_TEST
+#ifdef GOOGLE_TEST
     U guard[PAD];
-    d_vector(size_t s) : size(s), bytes((s + PAD*2) * sizeof(T)) {
+    d_vector(size_t s)
+        : size(s)
+        , bytes((s + PAD * 2) * sizeof(T))
+    {
         // Initialize guard with random data
-        if (PAD > 0) {
+        if(PAD > 0)
+        {
             rocblas_init_nan(guard, PAD);
         }
     }
-    #else
-    d_vector(size_t s) : size(s), bytes(s ? s * sizeof(T) : sizeof(T)) {}
-    #endif
+#else
+    d_vector(size_t s)
+        : size(s)
+        , bytes(s ? s * sizeof(T) : sizeof(T))
+    {
+    }
+#endif
 
     T* device_vector_setup()
     {
@@ -42,10 +50,11 @@ class d_vector
             fprintf(stderr, "Error allocating %'zu bytes (%zu GB)\n", bytes, bytes >> 30);
             d = nullptr;
         }
-        #ifdef GOOGLE_TEST
+#ifdef GOOGLE_TEST
         else
         {
-            if (PAD > 0) {
+            if(PAD > 0)
+            {
                 // Copy guard to device memory before allocated memory
                 hipMemcpy(d, guard, sizeof(guard), hipMemcpyHostToDevice);
 
@@ -56,7 +65,7 @@ class d_vector
                 hipMemcpy(d + size, guard, sizeof(guard), hipMemcpyHostToDevice);
             }
         }
-        #endif
+#endif
         return d;
     }
 
@@ -64,8 +73,9 @@ class d_vector
     {
         if(d != nullptr)
         {
-            #ifdef GOOGLE_TEST
-            if (PAD > 0) {
+#ifdef GOOGLE_TEST
+            if(PAD > 0)
+            {
                 U host[PAD];
 
                 // Copy device memory after allocated memory to host
@@ -83,7 +93,7 @@ class d_vector
                 // Make sure no corruption has occurred
                 EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0);
             }
-            #endif
+#endif
             // Free device memory
             CHECK_HIP_ERROR((hipFree)(d));
         }
@@ -93,53 +103,57 @@ class d_vector
 /* ============================================================================================ */
 /*! \brief  pseudo-vector subclass which uses a batch of device memory pointers and 
             an array of pointers in host memory*/
-template<typename T, size_t PAD = 4096, typename U = T>
-class device_batch_vector : private d_vector<T,PAD,U>
+template <typename T, size_t PAD = 4096, typename U = T>
+class device_batch_vector : private d_vector<T, PAD, U>
 {
 public:
-    explicit device_batch_vector(size_t b, size_t s) : batch(b), d_vector<T,PAD,U>(s) 
+    explicit device_batch_vector(size_t b, size_t s)
+        : batch(b)
+        , d_vector<T, PAD, U>(s)
     {
-        data = (T**) malloc(batch*sizeof(T*));
-        for(int b=0;b<batch;++b)
+        data = (T**)malloc(batch * sizeof(T*));
+        for(int b = 0; b < batch; ++b)
             data[b] = this->device_vector_setup();
     }
-    
+
     ~device_batch_vector()
     {
-        if(data != nullptr) {
-            for(int b=0;b<batch;++b)
+        if(data != nullptr)
+        {
+            for(int b = 0; b < batch; ++b)
                 this->device_vector_teardown(data[b]);
             free(data);
         }
     }
-    
-    T* operator [](int n)
+
+    T* operator[](int n)
     {
         return data[n];
     }
-    
+
     operator T**()
     {
         return data;
     }
-    
+
     // Disallow copying or assigning
     device_batch_vector(const device_batch_vector&) = delete;
     device_batch_vector& operator=(const device_batch_vector&) = delete;
 
 private:
-    T** data;
+    T**    data;
     size_t batch;
 };
 
 /* ============================================================================================ */
 /*! \brief  pseudo-vector subclass which uses device memory */
 template <typename T, size_t PAD = 4096, typename U = T>
-class device_vector : private d_vector<T,PAD,U>
+class device_vector : private d_vector<T, PAD, U>
 {
 public:
     // Must wrap constructor and destructor in functions to allow Google Test macros to work
-    explicit device_vector(size_t s) : d_vector<T,PAD,U>(s) 
+    explicit device_vector(size_t s)
+        : d_vector<T, PAD, U>(s)
     {
         data = this->device_vector_setup();
     }
@@ -154,7 +168,7 @@ class device_vector : private d_vector<T,PAD,U>
     {
         return data;
     }
-    
+
     operator const T*() const
     {
         return data;
@@ -171,7 +185,7 @@ class device_vector : private d_vector<T,PAD,U>
     device_vector& operator=(const device_vector&) = delete;
 
 private:
-    T*           data;
+    T* data;
 };
 
 /* ============================================================================================ */
diff --git a/clients/include/testing_gemv.hpp b/clients/include/testing_gemv.hpp
index a48958d1e..aa7fc6e00 100644
--- a/clients/include/testing_gemv.hpp
+++ b/clients/include/testing_gemv.hpp
@@ -261,8 +261,8 @@ void testing_gemv(const Arguments& arg)
         }
         std::cout << std::endl;
 
-        std::cout << M << "," << N << "," << h_alpha << "," << lda << "," << incx << "," << h_beta << "," << incy
-                  << "," << rocblas_gflops << "," << rocblas_bandwidth << ",";
+        std::cout << M << "," << N << "," << h_alpha << "," << lda << "," << incx << "," << h_beta
+                  << "," << incy << "," << rocblas_gflops << "," << rocblas_bandwidth << ",";
 
         if(arg.norm_check)
         {
diff --git a/clients/include/testing_gemv_batched.hpp b/clients/include/testing_gemv_batched.hpp
index 3f4f914a7..cdf6cb262 100644
--- a/clients/include/testing_gemv_batched.hpp
+++ b/clients/include/testing_gemv_batched.hpp
@@ -33,10 +33,10 @@ void testing_gemv_batched_bad_arg(const Arguments& arg)
     rocblas_local_handle handle;
 
     // allocate memory on device
-    device_vector<T*,0,T> dA(batch_count);
-    device_vector<T*,0,T> dx(batch_count);
-    device_vector<T*,0,T> dy(batch_count);
-    
+    device_vector<T*, 0, T> dA(batch_count);
+    device_vector<T*, 0, T> dx(batch_count);
+    device_vector<T*, 0, T> dy(batch_count);
+
     if(!dA || !dx || !dy)
     {
         CHECK_HIP_ERROR(hipErrorOutOfMemory);
@@ -92,9 +92,9 @@ void testing_gemv_batched(const Arguments& arg)
     // argument sanity check before allocating invalid memory
     if(M < 0 || N < 0 || lda < M || lda < 1 || !incx || !incy || batch_count < 0)
     {
-        device_vector<T*,0,T> dAA1(1);
-        device_vector<T*,0,T> dxA1(1);
-        device_vector<T*,0,T> dy_1A1(1);
+        device_vector<T*, 0, T> dAA1(1);
+        device_vector<T*, 0, T> dxA1(1);
+        device_vector<T*, 0, T> dy_1A1(1);
 
         if(!dAA1 || !dxA1 || !dy_1A1)
         {
@@ -125,10 +125,10 @@ void testing_gemv_batched(const Arguments& arg)
         return;
 
     //Device-arrays of pointers to device memory
-    device_vector<T*,0,T> dAA(batch_count);
-    device_vector<T*,0,T> dxA(batch_count);
-    device_vector<T*,0,T> dy_1A(batch_count);
-    device_vector<T*,0,T> dy_2A(batch_count);
+    device_vector<T*, 0, T> dAA(batch_count);
+    device_vector<T*, 0, T> dxA(batch_count);
+    device_vector<T*, 0, T> dy_1A(batch_count);
+    device_vector<T*, 0, T> dy_2A(batch_count);
 
     if(!dAA || !dxA || !dy_1A || !dy_2A)
     {
@@ -174,10 +174,10 @@ void testing_gemv_batched(const Arguments& arg)
 
     // Host-arrays of pointers to device memory
     // (intermediate arrays used for the transfers)
-    device_batch_vector<T> AA(batch_count,size_A);
-    device_batch_vector<T> xA(batch_count,size_x);
-    device_batch_vector<T> y_1A(batch_count,size_y);
-    device_batch_vector<T> y_2A(batch_count,size_y);
+    device_batch_vector<T> AA(batch_count, size_A);
+    device_batch_vector<T> xA(batch_count, size_x);
+    device_batch_vector<T> y_1A(batch_count, size_y);
+    device_batch_vector<T> y_2A(batch_count, size_y);
 
     device_vector<T> d_alpha(1);
     device_vector<T> d_beta(1);
@@ -356,5 +356,4 @@ void testing_gemv_batched(const Arguments& arg)
 
         std::cout << std::endl;
     }
-
 }
diff --git a/clients/include/unit.hpp b/clients/include/unit.hpp
index 8d5d52de2..92cc84188 100644
--- a/clients/include/unit.hpp
+++ b/clients/include/unit.hpp
@@ -230,10 +230,10 @@ void unit_check_general(rocblas_int    M,
                         host_vector<T> hGPU[]);
 
 template <>
-inline void unit_check_general(rocblas_int        M,
-                               rocblas_int        N,
-                               rocblas_int        batch_count,
-                               rocblas_int        lda,
+inline void unit_check_general(rocblas_int                   M,
+                               rocblas_int                   N,
+                               rocblas_int                   batch_count,
+                               rocblas_int                   lda,
                                host_vector<rocblas_bfloat16> hCPU[],
                                host_vector<rocblas_bfloat16> hGPU[])
 {
@@ -241,10 +241,10 @@ inline void unit_check_general(rocblas_int        M,
 }
 
 template <>
-inline void unit_check_general(rocblas_int        M,
-                               rocblas_int        N,
-                               rocblas_int        batch_count,
-                               rocblas_int        lda,
+inline void unit_check_general(rocblas_int               M,
+                               rocblas_int               N,
+                               rocblas_int               batch_count,
+                               rocblas_int               lda,
                                host_vector<rocblas_half> hCPU[],
                                host_vector<rocblas_half> hGPU[])
 {
@@ -274,10 +274,10 @@ inline void unit_check_general(rocblas_int         M,
 }
 
 template <>
-inline void unit_check_general(rocblas_int         M,
-                               rocblas_int         N,
-                               rocblas_int         batch_count,
-                               rocblas_int         lda,
+inline void unit_check_general(rocblas_int                        M,
+                               rocblas_int                        N,
+                               rocblas_int                        batch_count,
+                               rocblas_int                        lda,
                                host_vector<rocblas_float_complex> hCPU[],
                                host_vector<rocblas_float_complex> hGPU[])
 {
@@ -285,10 +285,10 @@ inline void unit_check_general(rocblas_int         M,
 }
 
 template <>
-inline void unit_check_general(rocblas_int         M,
-                               rocblas_int         N,
-                               rocblas_int         batch_count,
-                               rocblas_int         lda,
+inline void unit_check_general(rocblas_int                         M,
+                               rocblas_int                         N,
+                               rocblas_int                         batch_count,
+                               rocblas_int                         lda,
                                host_vector<rocblas_double_complex> hCPU[],
                                host_vector<rocblas_double_complex> hGPU[])
 {
diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp
index f02dda7d6..4c3a9c57b 100644
--- a/library/src/blas2/rocblas_gemv.hpp
+++ b/library/src/blas2/rocblas_gemv.hpp
@@ -197,18 +197,18 @@ rocblas_status rocblas_gemv_template(rocblas_handle    handle,
 
 template <typename T>
 rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
-                                        rocblas_operation transA,
-                                        rocblas_int       m,
-                                        rocblas_int       n,
-                                        const T*          alpha,
-                                        const T* const    A[],
-                                        rocblas_int       lda,
-                                        const T* const    x[],
-                                        rocblas_int       incx,
-                                        const T*          beta,
-                                        T* const          y[],
-                                        rocblas_int       incy,
-                                        rocblas_int       batch_count)
+                                             rocblas_operation transA,
+                                             rocblas_int       m,
+                                             rocblas_int       n,
+                                             const T*          alpha,
+                                             const T* const    A[],
+                                             rocblas_int       lda,
+                                             const T* const    x[],
+                                             rocblas_int       incx,
+                                             const T*          beta,
+                                             T* const          y[],
+                                             rocblas_int       incy,
+                                             rocblas_int       batch_count)
 {
     // Quick return if possible. Not Argument error
     if(!m || !n || !batch_count)
@@ -229,20 +229,20 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
         if(handle->pointer_mode == rocblas_pointer_mode_device)
         {
             hipLaunchKernelGGL((gemvn_kernel_batched<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   beta,
-                                   y,
-                                   incy);
+                               gemvn_grid,
+                               gemvn_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               alpha,
+                               A,
+                               lda,
+                               x,
+                               incx,
+                               beta,
+                               y,
+                               incy);
         }
         else
         {
@@ -250,20 +250,20 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
                 return rocblas_status_success;
 
             hipLaunchKernelGGL((gemvn_kernel_batched<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   *beta,
-                                   y,
-                                   incy);
+                               gemvn_grid,
+                               gemvn_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               *alpha,
+                               A,
+                               lda,
+                               x,
+                               incx,
+                               *beta,
+                               y,
+                               incy);
         }
     }
     else if(transA == rocblas_operation_transpose)
@@ -278,19 +278,19 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
         {
             hipLaunchKernelGGL(gemvt_kernel_batched<NB>,
                                gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   beta,
-                                   y,
-                                   incy);
+                               gemvt_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               alpha,
+                               A,
+                               lda,
+                               x,
+                               incx,
+                               beta,
+                               y,
+                               incy);
         }
         else
         {
@@ -298,20 +298,20 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
                 return rocblas_status_success;
 
             hipLaunchKernelGGL(gemvt_kernel_batched<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   *beta,
-                                   y,
-                                   incy);
+                               gemvt_grid,
+                               gemvt_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               *alpha,
+                               A,
+                               lda,
+                               x,
+                               incx,
+                               *beta,
+                               y,
+                               incy);
         }
     }
     else // conjugate transpose
@@ -325,20 +325,20 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
         if(handle->pointer_mode == rocblas_pointer_mode_device)
         {
             hipLaunchKernelGGL(gemvc_kernel_batched<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   beta,
-                                   y,
-                                   incy);
+                               gemvc_grid,
+                               gemvc_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               alpha,
+                               A,
+                               lda,
+                               x,
+                               incx,
+                               beta,
+                               y,
+                               incy);
         }
         else
         {
@@ -346,44 +346,43 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
                 return rocblas_status_success;
 
             hipLaunchKernelGGL(gemvc_kernel_batched<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   x,
-                                   incx,
-                                   *beta,
-                                   y,
-                                   incy);
+                               gemvc_grid,
+                               gemvc_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               *alpha,
+                               A,
+                               lda,
+                               x,
+                               incx,
+                               *beta,
+                               y,
+                               incy);
         }
     }
-    
+
     return rocblas_status_success;
 }
 
-
 template <typename T>
 rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
-                                                rocblas_operation transA,
-                                                rocblas_int       m,
-                                                rocblas_int       n,
-                                                const T*          alpha,
-                                                const T*          A,
-                                                rocblas_int       lda,
-                                                rocblas_int       strideA,
-                                                const T*          x,
-                                                rocblas_int       incx,
-                                                rocblas_int       stridex,
-                                                const T*          beta,
-                                                T*                y,
-                                                rocblas_int       incy,
-                                                rocblas_int       stridey,
-                                                rocblas_int       batch_count)
+                                                     rocblas_operation transA,
+                                                     rocblas_int       m,
+                                                     rocblas_int       n,
+                                                     const T*          alpha,
+                                                     const T*          A,
+                                                     rocblas_int       lda,
+                                                     rocblas_int       strideA,
+                                                     const T*          x,
+                                                     rocblas_int       incx,
+                                                     rocblas_int       stridex,
+                                                     const T*          beta,
+                                                     T*                y,
+                                                     rocblas_int       incy,
+                                                     rocblas_int       stridey,
+                                                     rocblas_int       batch_count)
 {
     // Quick return if possible. Not Argument error
     if(!m || !n || !batch_count)
@@ -404,23 +403,23 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
         if(handle->pointer_mode == rocblas_pointer_mode_device)
         {
             hipLaunchKernelGGL((gemvn_kernel_strided<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   beta,
-                                   y,
-                                   incy,
-                                   stridey);
+                               gemvn_grid,
+                               gemvn_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               alpha,
+                               A,
+                               lda,
+                               strideA,
+                               x,
+                               incx,
+                               stridex,
+                               beta,
+                               y,
+                               incy,
+                               stridey);
         }
         else
         {
@@ -428,23 +427,23 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
                 return rocblas_status_success;
 
             hipLaunchKernelGGL((gemvn_kernel_strided<GEMVN_DIM_X, GEMVN_DIM_Y>),
-                                   gemvn_grid,
-                                   gemvn_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   *beta,
-                                   y,
-                                   incy,
-                                   stridey);
+                               gemvn_grid,
+                               gemvn_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               *alpha,
+                               A,
+                               lda,
+                               strideA,
+                               x,
+                               incx,
+                               stridex,
+                               *beta,
+                               y,
+                               incy,
+                               stridey);
         }
     }
     else if(transA == rocblas_operation_transpose)
@@ -458,23 +457,23 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
         if(handle->pointer_mode == rocblas_pointer_mode_device)
         {
             hipLaunchKernelGGL(gemvt_kernel_strided<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   beta,
-                                   y,
-                                   incy,
-                                   stridey);
+                               gemvt_grid,
+                               gemvt_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               alpha,
+                               A,
+                               lda,
+                               strideA,
+                               x,
+                               incx,
+                               stridex,
+                               beta,
+                               y,
+                               incy,
+                               stridey);
         }
         else
         {
@@ -482,23 +481,23 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
                 return rocblas_status_success;
 
             hipLaunchKernelGGL(gemvt_kernel_strided<NB>,
-                                   gemvt_grid,
-                                   gemvt_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   *beta,
-                                   y,
-                                   incy,
-                                   stridey);
+                               gemvt_grid,
+                               gemvt_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               *alpha,
+                               A,
+                               lda,
+                               strideA,
+                               x,
+                               incx,
+                               stridex,
+                               *beta,
+                               y,
+                               incy,
+                               stridey);
         }
     }
     else // conjugate transpose
@@ -512,23 +511,23 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
         if(handle->pointer_mode == rocblas_pointer_mode_device)
         {
             hipLaunchKernelGGL(gemvc_kernel_strided<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   beta,
-                                   y,
-                                   incy,
-                                   stridey);
+                               gemvc_grid,
+                               gemvc_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               alpha,
+                               A,
+                               lda,
+                               strideA,
+                               x,
+                               incx,
+                               stridex,
+                               beta,
+                               y,
+                               incy,
+                               stridey);
         }
         else
         {
@@ -536,27 +535,26 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
                 return rocblas_status_success;
 
             hipLaunchKernelGGL(gemvc_kernel_strided<NB>,
-                                   gemvc_grid,
-                                   gemvc_threads,
-                                   0,
-                                   rocblas_stream,
-                                   m,
-                                   n,
-                                   *alpha,
-                                   A,
-                                   lda,
-                                   strideA,
-                                   x,
-                                   incx,
-                                   stridex,
-                                   *beta,
-                                   y,
-                                   incy,
-                                   stridey);
+                               gemvc_grid,
+                               gemvc_threads,
+                               0,
+                               rocblas_stream,
+                               m,
+                               n,
+                               *alpha,
+                               A,
+                               lda,
+                               strideA,
+                               x,
+                               incx,
+                               stridex,
+                               *beta,
+                               y,
+                               incy,
+                               stridey);
         }
     }
     return rocblas_status_success;
-
 }
 
 #endif
diff --git a/library/src/blas2/rocblas_gemv_batched.cpp b/library/src/blas2/rocblas_gemv_batched.cpp
index 7c7005dac..c0af67b88 100644
--- a/library/src/blas2/rocblas_gemv_batched.cpp
+++ b/library/src/blas2/rocblas_gemv_batched.cpp
@@ -1,10 +1,10 @@
 /* ************************************************************************
  * Copyright 2016-2019 Advanced Micro Devices, Inc.
  * ************************************************************************ */
-#include "rocblas_gemv.hpp"
 #include "handle.h"
 #include "logging.h"
 #include "rocblas.h"
+#include "rocblas_gemv.hpp"
 #include "utility.h"
 
 namespace
@@ -22,18 +22,18 @@ namespace
 
     template <typename T>
     rocblas_status rocblas_gemv_batched_impl(rocblas_handle    handle,
-                                        rocblas_operation transA,
-                                        rocblas_int       m,
-                                        rocblas_int       n,
-                                        const T*          alpha,
-                                        const T* const    A[],
-                                        rocblas_int       lda,
-                                        const T* const    x[],
-                                        rocblas_int       incx,
-                                        const T*          beta,
-                                        T* const          y[],
-                                        rocblas_int       incy,
-                                        rocblas_int       batch_count)
+                                             rocblas_operation transA,
+                                             rocblas_int       m,
+                                             rocblas_int       n,
+                                             const T*          alpha,
+                                             const T* const    A[],
+                                             rocblas_int       lda,
+                                             const T* const    x[],
+                                             rocblas_int       incx,
+                                             const T*          beta,
+                                             T* const          y[],
+                                             rocblas_int       incy,
+                                             rocblas_int       batch_count)
     {
         if(!handle)
             return rocblas_status_invalid_handle;
@@ -138,11 +138,11 @@ namespace
         if(batch_count < 0)
             return rocblas_status_invalid_size;
 
-        return rocblas_gemv_batched_template(handle,transA,m,n,alpha,A,lda,x,incx,beta,y,incy,batch_count);
+        return rocblas_gemv_batched_template(
+            handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy, batch_count);
     }
 } // namespace
 
-
 /*
  * ===========================================================================
  *    C wrapper
diff --git a/library/src/blas2/rocblas_gemv_strided_batched.cpp b/library/src/blas2/rocblas_gemv_strided_batched.cpp
index 6238b5ea0..8e2b6fc30 100644
--- a/library/src/blas2/rocblas_gemv_strided_batched.cpp
+++ b/library/src/blas2/rocblas_gemv_strided_batched.cpp
@@ -1,10 +1,10 @@
 /* ************************************************************************
  * Copyright 2016-2019 Advanced Micro Devices, Inc.
  * ************************************************************************ */
-#include "rocblas_gemv.hpp"
 #include "handle.h"
 #include "logging.h"
 #include "rocblas.h"
+#include "rocblas_gemv.hpp"
 #include "utility.h"
 
 namespace
@@ -22,21 +22,21 @@ namespace
 
     template <typename T>
     rocblas_status rocblas_gemv_strided_batched_impl(rocblas_handle    handle,
-                                                rocblas_operation transA,
-                                                rocblas_int       m,
-                                                rocblas_int       n,
-                                                const T*          alpha,
-                                                const T*          A,
-                                                rocblas_int       lda,
-                                                rocblas_int       strideA,
-                                                const T*          x,
-                                                rocblas_int       incx,
-                                                rocblas_int       stridex,
-                                                const T*          beta,
-                                                T*                y,
-                                                rocblas_int       incy,
-                                                rocblas_int       stridey,
-                                                rocblas_int       batch_count)
+                                                     rocblas_operation transA,
+                                                     rocblas_int       m,
+                                                     rocblas_int       n,
+                                                     const T*          alpha,
+                                                     const T*          A,
+                                                     rocblas_int       lda,
+                                                     rocblas_int       strideA,
+                                                     const T*          x,
+                                                     rocblas_int       incx,
+                                                     rocblas_int       stridex,
+                                                     const T*          beta,
+                                                     T*                y,
+                                                     rocblas_int       incy,
+                                                     rocblas_int       stridey,
+                                                     rocblas_int       batch_count)
     {
         if(!handle)
             return rocblas_status_invalid_handle;
@@ -185,17 +185,24 @@ namespace
             return rocblas_status_invalid_size;
 
         return rocblas_gemv_strided_batched_template(handle,
-                                                    transA,
-                                                    m, n, alpha,
-                                                    A, lda, strideA,
-                                                    x, incx, stridex,
-                                                    beta,
-                                                    y, incy, stridey,
-                                                    batch_count);
+                                                     transA,
+                                                     m,
+                                                     n,
+                                                     alpha,
+                                                     A,
+                                                     lda,
+                                                     strideA,
+                                                     x,
+                                                     incx,
+                                                     stridex,
+                                                     beta,
+                                                     y,
+                                                     incy,
+                                                     stridey,
+                                                     batch_count);
     }
 } //namespace
 
-
 /*
 * ===========================================================================
 *    C wrapper
@@ -222,21 +229,21 @@ rocblas_status rocblas_sgemv_strided_batched(rocblas_handle    handle,
                                              rocblas_int       batch_count)
 {
     return rocblas_gemv_strided_batched_impl(handle,
-                                        transA,
-                                        m,
-                                        n,
-                                        alpha,
-                                        A,
-                                        lda,
-                                        strideA,
-                                        x,
-                                        incx,
-                                        stridex,
-                                        beta,
-                                        y,
-                                        incy,
-                                        stridey,
-                                        batch_count);
+                                             transA,
+                                             m,
+                                             n,
+                                             alpha,
+                                             A,
+                                             lda,
+                                             strideA,
+                                             x,
+                                             incx,
+                                             stridex,
+                                             beta,
+                                             y,
+                                             incy,
+                                             stridey,
+                                             batch_count);
 }
 
 rocblas_status rocblas_dgemv_strided_batched(rocblas_handle    handle,
@@ -257,21 +264,21 @@ rocblas_status rocblas_dgemv_strided_batched(rocblas_handle    handle,
                                              rocblas_int       batch_count)
 {
     return rocblas_gemv_strided_batched_impl(handle,
-                                        transA,
-                                        m,
-                                        n,
-                                        alpha,
-                                        A,
-                                        lda,
-                                        strideA,
-                                        x,
-                                        incx,
-                                        stridex,
-                                        beta,
-                                        y,
-                                        incy,
-                                        stridey,
-                                        batch_count);
+                                             transA,
+                                             m,
+                                             n,
+                                             alpha,
+                                             A,
+                                             lda,
+                                             strideA,
+                                             x,
+                                             incx,
+                                             stridex,
+                                             beta,
+                                             y,
+                                             incy,
+                                             stridey,
+                                             batch_count);
 }
 
 rocblas_status rocblas_cgemv_strided_batched(rocblas_handle               handle,
@@ -292,21 +299,21 @@ rocblas_status rocblas_cgemv_strided_batched(rocblas_handle               handle
                                              rocblas_int                  batch_count)
 {
     return rocblas_gemv_strided_batched_impl(handle,
-                                        transA,
-                                        m,
-                                        n,
-                                        alpha,
-                                        A,
-                                        lda,
-                                        strideA,
-                                        x,
-                                        incx,
-                                        stridex,
-                                        beta,
-                                        y,
-                                        incy,
-                                        stridey,
-                                        batch_count);
+                                             transA,
+                                             m,
+                                             n,
+                                             alpha,
+                                             A,
+                                             lda,
+                                             strideA,
+                                             x,
+                                             incx,
+                                             stridex,
+                                             beta,
+                                             y,
+                                             incy,
+                                             stridey,
+                                             batch_count);
 }
 
 rocblas_status rocblas_zgemv_strided_batched(rocblas_handle                handle,
@@ -327,21 +334,21 @@ rocblas_status rocblas_zgemv_strided_batched(rocblas_handle                handl
                                              rocblas_int                   batch_count)
 {
     return rocblas_gemv_strided_batched_impl(handle,
-                                        transA,
-                                        m,
-                                        n,
-                                        alpha,
-                                        A,
-                                        lda,
-                                        strideA,
-                                        x,
-                                        incx,
-                                        stridex,
-                                        beta,
-                                        y,
-                                        incy,
-                                        stridey,
-                                        batch_count);
+                                             transA,
+                                             m,
+                                             n,
+                                             alpha,
+                                             A,
+                                             lda,
+                                             strideA,
+                                             x,
+                                             incx,
+                                             stridex,
+                                             beta,
+                                             y,
+                                             incy,
+                                             stridey,
+                                             batch_count);
 }
 
 } // extern "C"

From 97f69d6a408394fce3c1729ea4677045c2f4c7de Mon Sep 17 00:00:00 2001
From: jzuniga-amd <juan.zuniga-anaya@amd.com>
Date: Wed, 14 Aug 2019 09:02:35 -0600
Subject: [PATCH 3/5] Resolve merge conflicts

---
 clients/common/norm.cpp  | 868 ---------------------------------------
 clients/include/norm.hpp | 356 ++++++++++++++--
 2 files changed, 326 insertions(+), 898 deletions(-)
 delete mode 100644 clients/common/norm.cpp

diff --git a/clients/common/norm.cpp b/clients/common/norm.cpp
deleted file mode 100644
index cdcc5fb6a..000000000
--- a/clients/common/norm.cpp
+++ /dev/null
@@ -1,868 +0,0 @@
-/* ************************************************************************
- * Copyright 2018-2019 Advanced Micro Devices, Inc.
- *
- * ************************************************************************ */
-
-#include "norm.hpp"
-#include "cblas.h"
-#include "rocblas.h"
-#include "rocblas_vector.hpp"
-#include "utility.hpp"
-#include <cstdio>
-#include <limits>
-#include <memory>
-
-/* =====================================================================
-     README: Norm check: norm(A-B)/norm(A), evaluate relative error
-             Numerically, it is recommended by lapack.
-
-    Call lapack fortran routines that do not exsit in cblas library.
-    No special header is required. But need to declare
-    function prototype
-
-    All the functions are fortran and should append underscore (_) while
-    declaring prototype and calling.
-    xlange and xaxpy prototype are like following
-    =================================================================== */
-
-extern "C" {
-float  slange_(char* norm_type, int* m, int* n, float* A, int* lda, float* work);
-double dlange_(char* norm_type, int* m, int* n, double* A, int* lda, double* work);
-float  clange_(char* norm_type, int* m, int* n, rocblas_float_complex* A, int* lda, float* work);
-double zlange_(char* norm_type, int* m, int* n, rocblas_double_complex* A, int* lda, double* work);
-
-float  slansy_(char* norm_type, char* uplo, int* n, float* A, int* lda, float* work);
-double dlansy_(char* norm_type, char* uplo, int* n, double* A, int* lda, double* work);
-float clanhe_(char* norm_type, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* work);
-double
-    zlanhe_(char* norm_type, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* work);
-
-void saxpy_(int* n, float* alpha, float* x, int* incx, float* y, int* incy);
-void daxpy_(int* n, double* alpha, double* x, int* incx, double* y, int* incy);
-void caxpy_(
-    int* n, float* alpha, rocblas_float_complex* x, int* incx, rocblas_float_complex* y, int* incy);
-void zaxpy_(int*                    n,
-            double*                 alpha,
-            rocblas_double_complex* x,
-            int*                    incx,
-            rocblas_double_complex* y,
-            int*                    incy);
-}
-
-/* ============================Norm Check for General Matrix: float/double/complex template
- * speciliazation ======================================= */
-
-/*! \brief compare the norm error of two matrices hCPU & hGPU */
-template <>
-double norm_check_general<rocblas_bfloat16>(char              norm_type,
-                                            rocblas_int       M,
-                                            rocblas_int       N,
-                                            rocblas_int       lda,
-                                            rocblas_bfloat16* hCPU,
-                                            rocblas_bfloat16* hGPU)
-{
-    // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-
-    double error_double = std::numeric_limits<double>::quiet_NaN();
-
-    host_vector<float> hCPU_float(N * lda), hGPU_float(N * lda);
-    for(rocblas_int i = 0; i < N * lda; i++)
-    {
-        hCPU_float[i] = float(hCPU[i]);
-        hGPU_float[i] = float(hGPU[i]);
-    }
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    float cpu_norm = slange_(&norm_type, &M, &N, hCPU_float, &lda, &work);
-    saxpy_(&size, &alpha, hCPU_float, &incx, hGPU_float, &incx);
-
-    float error_float = slange_(&norm_type, &M, &N, hGPU_float, &lda, &work) / cpu_norm;
-    error_double      = double(error_float);
-
-    return error_double;
-}
-
-template <>
-double norm_check_general<rocblas_half>(char          norm_type,
-                                        rocblas_int   M,
-                                        rocblas_int   N,
-                                        rocblas_int   lda,
-                                        rocblas_half* hCPU,
-                                        rocblas_half* hGPU)
-{
-    // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-
-    double error_double = std::numeric_limits<double>::quiet_NaN();
-
-    host_vector<float> hCPU_float(N * lda), hGPU_float(N * lda);
-    for(rocblas_int i = 0; i < N * lda; i++)
-    {
-        hCPU_float[i] = half_to_float(hCPU[i]);
-        hGPU_float[i] = half_to_float(hGPU[i]);
-    }
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    float cpu_norm = slange_(&norm_type, &M, &N, hCPU_float, &lda, &work);
-    saxpy_(&size, &alpha, hCPU_float, &incx, hGPU_float, &incx);
-
-    float error_float = slange_(&norm_type, &M, &N, hGPU_float, &lda, &work) / cpu_norm;
-    error_double      = double(error_float);
-
-    return error_double;
-}
-
-template <>
-double norm_check_general<float>(
-    char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, float* hCPU, float* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    float cpu_norm = slange_(&norm_type, &M, &N, hCPU, &lda, &work);
-    saxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    float error = slange_(&norm_type, &M, &N, hGPU, &lda, &work) / cpu_norm;
-
-    return (double)error;
-}
-
-template <>
-double norm_check_general<double>(
-    char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, double* hCPU, double* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-
-    double      work[1];
-    rocblas_int incx  = 1;
-    double      alpha = -1.0;
-    rocblas_int size  = lda * N;
-
-    double cpu_norm = dlange_(&norm_type, &M, &N, hCPU, &lda, work);
-    daxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    double error = dlange_(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm;
-
-    return error;
-}
-
-template <>
-double norm_check_general<int32_t>(
-    char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, int32_t* hCPU, int32_t* hGPU)
-{
-    // Upconvert int32_t to double and call double version
-    host_vector<double> hCPU_double(M * N), hGPU_double(M * N);
-
-    for(int i = 0; i < M * N; i++)
-    {
-        hCPU_double[i] = double(hCPU[i]);
-        hGPU_double[i] = double(hGPU[i]);
-    }
-    return norm_check_general<double>(norm_type, M, N, lda, hCPU_double, hGPU_double);
-}
-
-template <>
-double norm_check_general<rocblas_float_complex>(char                   norm_type,
-                                                 rocblas_int            M,
-                                                 rocblas_int            N,
-                                                 rocblas_int            lda,
-                                                 rocblas_float_complex* hCPU,
-                                                 rocblas_float_complex* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-
-    float       work[1];
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    float cpu_norm = clange_(&norm_type, &M, &N, hCPU, &lda, work);
-    caxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    float error = clange_(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm;
-
-    return (double)error;
-}
-
-template <>
-double norm_check_general<rocblas_double_complex>(char                    norm_type,
-                                                  rocblas_int             M,
-                                                  rocblas_int             N,
-                                                  rocblas_int             lda,
-                                                  rocblas_double_complex* hCPU,
-                                                  rocblas_double_complex* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-
-    double      work[1];
-    rocblas_int incx  = 1;
-    double      alpha = -1.0;
-    rocblas_int size  = lda * N;
-
-    double cpu_norm = zlange_(&norm_type, &M, &N, hCPU, &lda, work);
-    zaxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    double error = zlange_(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm;
-
-    return error;
-}
-
-//=====Norm Check for strided_batched matrix
-template <>
-double norm_check_general<rocblas_bfloat16>(char              norm_type,
-                                            rocblas_int       M,
-                                            rocblas_int       N,
-                                            rocblas_int       lda,
-                                            rocblas_int       stride_a,
-                                            rocblas_int       batch_count,
-                                            rocblas_bfloat16* hCPU,
-                                            rocblas_bfloat16* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    rocblas_int        totalsize = N * lda + (batch_count - 1) * stride_a;
-    host_vector<float> hCPU_float(totalsize), hGPU_float(totalsize);
-    for(rocblas_int i_batch = 0; i_batch < batch_count; i_batch++)
-    {
-        for(rocblas_int i = 0; i < N * lda; i++)
-        {
-            auto index        = i + i_batch * stride_a;
-            hCPU_float[index] = float(hCPU[index]);
-            hGPU_float[index] = float(hGPU[index]);
-        }
-    }
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(rocblas_int i = 0; i < batch_count; i++)
-    {
-        float cpu_norm = slange_(&norm_type, &M, &N, &hCPU_float[i * stride_a], &lda, &work);
-
-        saxpy_(&size, &alpha, &hCPU_float[i * stride_a], &incx, &hGPU_float[i * stride_a], &incx);
-
-        float error
-            = slange_(&norm_type, &M, &N, &hGPU_float[i * stride_a], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<rocblas_half>(char          norm_type,
-                                        rocblas_int   M,
-                                        rocblas_int   N,
-                                        rocblas_int   lda,
-                                        rocblas_int   stride_a,
-                                        rocblas_int   batch_count,
-                                        rocblas_half* hCPU,
-                                        rocblas_half* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    rocblas_int        totalsize = N * lda + (batch_count - 1) * stride_a;
-    host_vector<float> hCPU_float(totalsize), hGPU_float(totalsize);
-    for(rocblas_int i_batch = 0; i_batch < batch_count; i_batch++)
-    {
-        for(rocblas_int i = 0; i < N * lda; i++)
-        {
-            auto index        = i + i_batch * stride_a;
-            hCPU_float[index] = half_to_float(hCPU[index]);
-            hGPU_float[index] = half_to_float(hGPU[index]);
-        }
-    }
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(rocblas_int i = 0; i < batch_count; i++)
-    {
-        float cpu_norm = slange_(&norm_type, &M, &N, &hCPU_float[i * stride_a], &lda, &work);
-
-        saxpy_(&size, &alpha, &hCPU_float[i * stride_a], &incx, &hGPU_float[i * stride_a], &incx);
-
-        float error
-            = slange_(&norm_type, &M, &N, &hGPU_float[i * stride_a], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general(char         norm_type,
-                          rocblas_int  M,
-                          rocblas_int  N,
-                          rocblas_int  lda,
-                          rocblas_int  stride_a,
-                          rocblas_int  batch_count,
-                          rocblas_int* hCPU,
-                          rocblas_int* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    rocblas_int         totalsize = N * lda + (batch_count - 1) * stride_a;
-    host_vector<double> hCPU_double(totalsize), hGPU_double(totalsize);
-    for(rocblas_int i_batch = 0; i_batch < batch_count; i_batch++)
-    {
-        for(rocblas_int i = 0; i < N * lda; i++)
-        {
-            auto index         = i + i_batch * stride_a;
-            hCPU_double[index] = hCPU[index];
-            hGPU_double[index] = hGPU[index];
-        }
-    }
-
-    double      work;
-    rocblas_int incx             = 1;
-    double      alpha            = -1.0f;
-    rocblas_int size             = lda * N;
-    double      cumulative_error = 0.0;
-
-    for(rocblas_int i = 0; i < batch_count; i++)
-    {
-        double cpu_norm = dlange_(&norm_type, &M, &N, &hCPU_double[i * stride_a], &lda, &work);
-
-        daxpy_(&size, &alpha, &hCPU_double[i * stride_a], &incx, &hGPU_double[i * stride_a], &incx);
-
-        double error
-            = dlange_(&norm_type, &M, &N, &hGPU_double[i * stride_a], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<float>(char        norm_type,
-                                 rocblas_int M,
-                                 rocblas_int N,
-                                 rocblas_int lda,
-                                 rocblas_int stride_a,
-                                 rocblas_int batch_count,
-                                 float*      hCPU,
-                                 float*      hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        float cpu_norm = slange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work);
-
-        saxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx);
-
-        float error = slange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<double>(char        norm_type,
-                                  rocblas_int M,
-                                  rocblas_int N,
-                                  rocblas_int lda,
-                                  rocblas_int stride_a,
-                                  rocblas_int batch_count,
-                                  double*     hCPU,
-                                  double*     hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    double      work;
-    rocblas_int incx  = 1;
-    double      alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        double cpu_norm = dlange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work);
-
-        daxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx);
-
-        double error = dlange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<rocblas_float_complex>(char                   norm_type,
-                                                 rocblas_int            M,
-                                                 rocblas_int            N,
-                                                 rocblas_int            lda,
-                                                 rocblas_int            stride_a,
-                                                 rocblas_int            batch_count,
-                                                 rocblas_float_complex* hCPU,
-                                                 rocblas_float_complex* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        float cpu_norm = clange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work);
-
-        caxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx);
-
-        float error = clange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<rocblas_double_complex>(char                    norm_type,
-                                                  rocblas_int             M,
-                                                  rocblas_int             N,
-                                                  rocblas_int             lda,
-                                                  rocblas_int             stride_a,
-                                                  rocblas_int             batch_count,
-                                                  rocblas_double_complex* hCPU,
-                                                  rocblas_double_complex* hGPU)
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    double      work;
-    rocblas_int incx  = 1;
-    double      alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        double cpu_norm = zlange_(&norm_type, &M, &N, &(hCPU[i * stride_a]), &lda, &work);
-
-        zaxpy_(&size, &alpha, &(hCPU[i * stride_a]), &incx, &(hGPU[i * stride_a]), &incx);
-
-        double error = zlange_(&norm_type, &M, &N, &(hGPU[i * stride_a]), &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-//=====Norm Check for batched matrix
-template <>
-double norm_check_general<float>(char               norm_type,
-                                 rocblas_int        M,
-                                 rocblas_int        N,
-                                 rocblas_int        lda,
-                                 rocblas_int        batch_count,
-                                 host_vector<float> hCPU[],
-                                 host_vector<float> hGPU[])
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        float cpu_norm = slange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
-
-        saxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
-
-        float error = slange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<double>(char                norm_type,
-                                  rocblas_int         M,
-                                  rocblas_int         N,
-                                  rocblas_int         lda,
-                                  rocblas_int         batch_count,
-                                  host_vector<double> hCPU[],
-                                  host_vector<double> hGPU[])
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    double      work;
-    rocblas_int incx  = 1;
-    double      alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        double cpu_norm = dlange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
-
-        daxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
-
-        double error = dlange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<rocblas_float_complex>(char                               norm_type,
-                                                 rocblas_int                        M,
-                                                 rocblas_int                        N,
-                                                 rocblas_int                        lda,
-                                                 rocblas_int                        batch_count,
-                                                 host_vector<rocblas_float_complex> hCPU[],
-                                                 host_vector<rocblas_float_complex> hGPU[])
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    float       work;
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        float cpu_norm = clange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
-
-        caxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
-
-        float error = clange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-template <>
-double norm_check_general<rocblas_double_complex>(char                                norm_type,
-                                                  rocblas_int                         M,
-                                                  rocblas_int                         N,
-                                                  rocblas_int                         lda,
-                                                  rocblas_int                         batch_count,
-                                                  host_vector<rocblas_double_complex> hCPU[],
-                                                  host_vector<rocblas_double_complex> hGPU[])
-{
-    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
-    // one norm is max column sum
-    // infinity norm is max row sum
-    // Frobenius is l2 norm of matrix entries
-    //
-    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
-    // of strided batched matrix
-
-    double      work;
-    rocblas_int incx  = 1;
-    double      alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    double cumulative_error = 0.0;
-
-    for(int i = 0; i < batch_count; i++)
-    {
-        double cpu_norm = zlange_(&norm_type, &M, &N, hCPU[i], &lda, &work);
-
-        zaxpy_(&size, &alpha, hCPU[i], &incx, hGPU[i], &incx);
-
-        double error = zlange_(&norm_type, &M, &N, hGPU[i], &lda, &work) / cpu_norm;
-
-        if(norm_type == 'F' || norm_type == 'f')
-        {
-            cumulative_error += error;
-        }
-        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
-        {
-            cumulative_error = cumulative_error > error ? cumulative_error : error;
-        }
-    }
-
-    return cumulative_error;
-}
-
-/* ============================Norm Check for Symmetric Matrix: float/double/complex template
- * speciliazation ======================================= */
-
-/*! \brief compare the norm error of two hermitian/symmetric matrices hCPU & hGPU */
-
-template <>
-double norm_check_symmetric<float>(
-    char norm_type, char uplo, rocblas_int N, rocblas_int lda, float* hCPU, float* hGPU)
-{
-    // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly
-
-    float       work[1];
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    float cpu_norm = slansy_(&norm_type, &uplo, &N, hCPU, &lda, work);
-    saxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    float error = slansy_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm;
-
-    return (double)error;
-}
-
-template <>
-double norm_check_symmetric<double>(
-    char norm_type, char uplo, rocblas_int N, rocblas_int lda, double* hCPU, double* hGPU)
-{
-    // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly
-
-    double      work[1];
-    rocblas_int incx  = 1;
-    double      alpha = -1.0;
-    rocblas_int size  = lda * N;
-
-    double cpu_norm = dlansy_(&norm_type, &uplo, &N, hCPU, &lda, work);
-    daxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    double error = dlansy_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm;
-
-    return error;
-}
-
-template <>
-double norm_check_symmetric<rocblas_float_complex>(char                   norm_type,
-                                                   char                   uplo,
-                                                   rocblas_int            N,
-                                                   rocblas_int            lda,
-                                                   rocblas_float_complex* hCPU,
-                                                   rocblas_float_complex* hGPU)
-{
-    // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly
-
-    float       work[1];
-    rocblas_int incx  = 1;
-    float       alpha = -1.0f;
-    rocblas_int size  = lda * N;
-
-    float cpu_norm = clanhe_(&norm_type, &uplo, &N, hCPU, &lda, work);
-    caxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    float error = clanhe_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm;
-
-    return (double)error;
-}
-
-template <>
-double norm_check_symmetric<rocblas_double_complex>(char                    norm_type,
-                                                    char                    uplo,
-                                                    rocblas_int             N,
-                                                    rocblas_int             lda,
-                                                    rocblas_double_complex* hCPU,
-                                                    rocblas_double_complex* hGPU)
-{
-    // norm type can be M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly
-
-    double      work[1];
-    rocblas_int incx  = 1;
-    double      alpha = -1.0;
-    rocblas_int size  = lda * N;
-
-    double cpu_norm = zlanhe_(&norm_type, &uplo, &N, hCPU, &lda, work);
-    zaxpy_(&size, &alpha, hCPU, &incx, hGPU, &incx);
-
-    double error = zlanhe_(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm;
-
-    return error;
-}
diff --git a/clients/include/norm.hpp b/clients/include/norm.hpp
index d81f8c4c8..c89b38ce9 100644
--- a/clients/include/norm.hpp
+++ b/clients/include/norm.hpp
@@ -6,8 +6,15 @@
 #ifndef _NORM_H
 #define _NORM_H
 
+#include "cblas.h"
+#include "norm.hpp"
 #include "rocblas.h"
 #include "rocblas_vector.hpp"
+#include "utility.hpp"
+#include <cstdio>
+#include <limits>
+#include <memory>
+
 
 /* =====================================================================
         Norm check: norm(A-B)/norm(A), evaluate relative error
@@ -20,39 +27,328 @@
 /* ========================================Norm Check
  * ==================================================== */
 
-/*! \brief  Template: norm check for general Matrix: half/float/doubel/complex  */
+/* LAPACK and BLAS fortran library functionality */
+extern "C" {
+float  slange_(char* norm_type, int* m, int* n, float* A, int* lda, float* work);
+double dlange_(char* norm_type, int* m, int* n, double* A, int* lda, double* work);
+float  clange_(char* norm_type, int* m, int* n, rocblas_float_complex* A, int* lda, float* work);
+double zlange_(char* norm_type, int* m, int* n, rocblas_double_complex* A, int* lda, double* work);
 
-// see check_norm.cpp for template speciliazation
-// use auto as the return type is only allowed in c++14
-// convert float/float to double
-template <typename T>
-double norm_check_general(
-    char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU);
+float  slansy_(char* norm_type, char* uplo, int* n, float* A, int* lda, float* work);
+double dlansy_(char* norm_type, char* uplo, int* n, double* A, int* lda, double* work);
 
-/*! \brief  Template: norm check for strided_batched Matrix: half/float/double/complex */
-template <typename T>
-double norm_check_general(char        norm_type,
-                          rocblas_int M,
-                          rocblas_int N,
-                          rocblas_int lda,
-                          rocblas_int stride_a,
-                          rocblas_int batch_count,
-                          T*          hCPU,
-                          T*          hGPU);
-
-/*! \brief  Template: norm check for batched Matrix: half/float/double/complex */
+float clanhe_(char* norm_type, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* work);
+double zlanhe_(char* norm_type, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* work);
+
+void saxpy_(int* n, float* alpha, float* x, int* incx, float* y, int* incy);
+void daxpy_(int* n, double* alpha, double* x, int* incx, double* y, int* incy);
+void caxpy_(
+    int* n, float* alpha, rocblas_float_complex* x, int* incx, rocblas_float_complex* y, int* incy);
+void zaxpy_(int*                    n,
+            double*                 alpha,
+            rocblas_double_complex* x,
+            int*                    incx,
+            rocblas_double_complex* y,
+            int*                    incy);
+}
+
+/*! \brief  Overloads of the LAPACK ?lange_ general-matrix norm, selected by element type: float/double/complex */
+inline float xlange(char* norm_type, int* m, int* n, float* A, int* lda, float* work)
+{
+    return slange_(norm_type, m, n, A, lda, work); // real, single precision
+}
+
+inline double xlange(char* norm_type, int* m, int* n, double* A, int* lda, double* work)
+{
+    return dlange_(norm_type, m, n, A, lda, work); // real, double precision
+}
+
+inline float
+    xlange(char* norm_type, int* m, int* n, rocblas_float_complex* A, int* lda, float* work)
+{
+    return clange_(norm_type, m, n, A, lda, work); // complex input, float result and workspace
+}
+
+inline double
+    xlange(char* norm_type, int* m, int* n, rocblas_double_complex* A, int* lda, double* work)
+{
+    return zlange_(norm_type, m, n, A, lda, work); // complex input, double result and workspace
+}
+
+inline float xlanhe(char* norm_type, char* uplo, int* n, float* A, int* lda, float* work)
+{
+    return slansy_(norm_type, uplo, n, A, lda, work); // real float: forwards to the symmetric routine
+}
+
+inline double xlanhe(char* norm_type, char* uplo, int* n, double* A, int* lda, double* work)
+{
+    return dlansy_(norm_type, uplo, n, A, lda, work); // real double: forwards to the symmetric routine
+}
+
+inline float
+    xlanhe(char* norm_type, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* work)
+{
+    return clanhe_(norm_type, uplo, n, A, lda, work); // complex float: forwards to the hermitian routine
+}
+
+inline double
+    xlanhe(char* norm_type, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* work)
+{
+    return zlanhe_(norm_type, uplo, n, A, lda, work); // complex double: forwards to the hermitian routine
+}
+
+inline void xaxpy(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
+{
+    return saxpy_(n, alpha, x, incx, y, incy); // real, single precision
+}
+
+inline void xaxpy(int* n, double* alpha, double* x, int* incx, double* y, int* incy)
+{
+    return daxpy_(n, alpha, x, incx, y, incy); // real, double precision
+}
+// NOTE(review): the complex overloads below take alpha as a pointer to a real scalar; reference BLAS ?axpy for complex data takes a complex alpha - confirm
+inline void xaxpy(
+    int* n, float* alpha, rocblas_float_complex* x, int* incx, rocblas_float_complex* y, int* incy)
+{
+    return caxpy_(n, alpha, x, incx, y, incy); // complex, single precision
+}
+
+inline void xaxpy(int*                    n,
+                  double*                 alpha,
+                  rocblas_double_complex* x,
+                  int*                    incx,
+                  rocblas_double_complex* y,
+                  int*                    incy)
+{
+    return zaxpy_(n, alpha, x, incx, y, incy);
+}
+
+
+/* ============== Norm Check for General Matrix ============= */
+/*! \brief compare the norm error of two matrices hCPU & hGPU */
+template <typename T, typename std::enable_if<!is_complex<T>, int>::type = 0>
+inline double norm_check_general(
+    char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU)
+{
+    // Relative error norm(hGPU - hCPU) / norm(hCPU) of two M x N matrices (leading dimension lda), real T.
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // hCPU and hGPU are only read: the computation runs on double copies of both.
+
+    host_vector<double> hCPU_double(N * lda);
+    host_vector<double> hGPU_double(N * lda);
+    // Widen every one of the N * lda elements to double before calling LAPACK/BLAS.
+    for(rocblas_int i = 0; i < N * lda; i++)
+    {
+        hCPU_double[i] = double(hCPU[i]);
+        hGPU_double[i] = double(hGPU[i]);
+    }
+
+    double      work[1];
+    rocblas_int incx  = 1;
+    double      alpha = -1.0;
+    rocblas_int size  = lda * N;
+    // axpy with alpha = -1 turns the hGPU copy into (hGPU - hCPU) before its norm is taken.
+    double cpu_norm = xlange(&norm_type, &M, &N, hCPU_double.data(), &lda, work);
+    xaxpy(&size, &alpha, hCPU_double.data(), &incx, hGPU_double.data(), &incx);
+    double error = xlange(&norm_type, &M, &N, hGPU_double.data(), &lda, work) / cpu_norm;
+    // NOTE(review): cpu_norm == 0 (all-zero reference) is not guarded against.
+    return error;
+}
+
+template <typename T, typename std::enable_if<is_complex<T>, int>::type = 0>
+inline double norm_check_general(
+    char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU)
+{
+    // Relative error norm(hGPU - hCPU) / norm(hCPU) of two M x N complex matrices (leading dimension lda).
+    // norm_type: 'O'/'o' one norm (max column sum), 'I'/'i' infinity norm (max row sum),
+    //            'F'/'f' Frobenius norm (l2 norm of the matrix entries).
+    // hGPU is overwritten in place with the difference (hGPU - hCPU).
+    // caxpy_/zaxpy_ read alpha as a complex scalar, so it is stored as {real, imag} = {-1, 0}.
+    decltype(std::real(*hCPU)) work[1];
+    rocblas_int                incx     = 1;
+    decltype(std::real(*hCPU)) alpha[2] = {-1.0f, 0.0f};
+    rocblas_int                size     = lda * N;
+
+    double cpu_norm = xlange(&norm_type, &M, &N, hCPU, &lda, work);
+    xaxpy(&size, alpha, hCPU, &incx, hGPU, &incx);
+    double error = xlange(&norm_type, &M, &N, hGPU, &lda, work) / cpu_norm;
+
+    return error;
+}
+
+template <>
+inline double norm_check_general<rocblas_half, 0>(char          norm_type,
+                                                  rocblas_int   M,
+                                                  rocblas_int   N,
+                                                  rocblas_int   lda,
+                                                  rocblas_half* hCPU,
+                                                  rocblas_half* hGPU)
+
+{
+    // rocblas_half specialization of the general-matrix norm check.
+    // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
+    // (one norm is max column sum, infinity norm is max row sum, Frobenius is l2 norm of entries)
+    // hCPU and hGPU are only read; the comparison runs on double copies.
+
+    host_vector<double> hCPU_double(N * lda);
+    host_vector<double> hGPU_double(N * lda);
+    // half -> float -> double, element by element over the N * lda entries
+    for(rocblas_int i = 0; i < N * lda; i++)
+    {
+        hCPU_double[i] = double(half_to_float(hCPU[i]));
+        hGPU_double[i] = double(half_to_float(hGPU[i]));
+    }
+    // delegate to norm_check_general on the double buffers
+    return norm_check_general(norm_type, M, N, lda, hCPU_double.data(), hGPU_double.data());
+}
+
+/* ============== Norm Check for strided_batched case ============= */
 template <typename T>
-double norm_check_general(char           norm_type,
-                          rocblas_int    M,
-                          rocblas_int    N,
-                          rocblas_int    lda,
-                          rocblas_int    batch_count,
-                          host_vector<T> hCPU[],
-                          host_vector<T> hGPU[]);
-
-/*! \brief  Template: norm check for hermitian/symmetric Matrix: half/float/double/complex */
+inline double norm_check_general(char        norm_type,
+                                 rocblas_int M,
+                                 rocblas_int N,
+                                 rocblas_int lda,
+                                 rocblas_int stride_a,
+                                 rocblas_int batch_count,
+                                 T*          hCPU,
+                                 T*          hGPU)
+{
+    // Aggregates the per-matrix relative error over batch_count matrices stored stride_a elements apart.
+    // norm_type 'O'/'o' (one norm, max column sum) or 'I'/'i' (infinity norm, max row sum):
+    //   the result is the largest per-matrix error.
+    // norm_type 'F'/'f' (Frobenius, l2 norm of matrix entries): the per-matrix errors are summed;
+    //   by the triangle inequality ||a+b|| <= ||a|| + ||b|| this is an upper limit for the
+    //   Frobenius norm of the strided batched matrix.
+    // Any other norm_type matches neither branch below, so the function returns 0.
+
+    double cumulative_error = 0.0;
+
+    for(rocblas_int i = 0; i < batch_count; i++)
+    {
+        auto index = i * stride_a; // element offset of matrix i in hCPU / hGPU
+
+        auto error = norm_check_general(norm_type, M, N, lda, hCPU + index, hGPU + index);
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error; // Frobenius: accumulate the sum
+        }
+
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error; // running maximum
+        }
+    }
+
+    return cumulative_error;
+}
+
+/* ============== Norm Check for batched case ============= */
 template <typename T>
-double norm_check_symmetric(
-    char norm_type, char uplo, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU);
+inline double norm_check_general(char        norm_type,
+                                 rocblas_int M,
+                                 rocblas_int N,
+                                 rocblas_int lda,
+                                 rocblas_int batch_count,
+                                 host_vector<T>   hCPU[],
+                                 host_vector<T>   hGPU[])
+{
+    // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
+    // one norm is max column sum
+    // infinity norm is max row sum
+    // Frobenius is l2 norm of matrix entries
+    // hCPU and hGPU are arrays of batch_count host_vector<T>; entry i is passed on as matrix i
+    // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm
+    // of the batched matrices; for the one/infinity norms the largest per-matrix error is kept
+
+    double cumulative_error = 0.0;
+
+    for(rocblas_int i = 0; i < batch_count; i++)
+    {
+        auto index = i;
+
+        auto error = norm_check_general(norm_type, M, N, lda, hCPU[index], hGPU[index]);
+
+        if(norm_type == 'F' || norm_type == 'f')
+        {
+            cumulative_error += error; // Frobenius: accumulate the sum
+        }
+        else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i')
+        {
+            cumulative_error = cumulative_error > error ? cumulative_error : error; // running maximum
+        }
+    }
+    // any other norm_type matched neither branch, so the result is still 0 here
+    return cumulative_error;
+}
+
+
+/* ============== Norm Check for Symmetric Matrix ============= */
+/*! \brief compare the norm error of two hermitian/symmetric matrices hCPU & hGPU */
+template <typename T, typename std::enable_if<!is_complex<T>, int>::type = 0>
+inline double norm_check_symmetric(
+    char norm_type, char uplo, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU)
+{
+    // norm type can be 'M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly
+    // hCPU and hGPU are only read: the computation runs on double copies of both.
+    double      work[1];
+    rocblas_int incx  = 1;
+    double      alpha = -1.0;
+    rocblas_int size  = lda * N;
+
+    host_vector<double> hCPU_double(N * lda);
+    host_vector<double> hGPU_double(N * lda);
+
+    for(rocblas_int i = 0; i < N * lda; i++)
+    {
+        hCPU_double[i] = double(hCPU[i]);
+        hGPU_double[i] = double(hGPU[i]);
+    }
+    // NOTE(review): host_vector<double> is passed where xlanhe/xaxpy take double* - relies on an implicit conversion; confirm
+    double cpu_norm = xlanhe(&norm_type, &uplo, &N, hCPU_double, &lda, work);
+    xaxpy(&size, &alpha, hCPU_double, &incx, hGPU_double, &incx);
+    double error = xlanhe(&norm_type, &uplo, &N, hGPU_double, &lda, work) / cpu_norm;
+
+    return error;
+}
+
+template <typename T, typename std::enable_if<is_complex<T>, int>::type = 0>
+inline double norm_check_symmetric(
+    char norm_type, char uplo, rocblas_int N, rocblas_int lda, T* hCPU, T* hGPU)
+{
+    // Relative error norm(hGPU - hCPU) / norm(hCPU) for complex T; hGPU is overwritten with the difference.
+    // norm type can be 'M', 'I', 'F', 'l': 'F' (Frobenius norm) is used mostly
+    decltype(std::real(*hCPU)) work[1];
+    rocblas_int                incx     = 1;
+    decltype(std::real(*hCPU)) alpha[2] = {-1.0f, 0.0f};
+    rocblas_int                size     = lda * N;
+    // caxpy_/zaxpy_ read alpha as a complex scalar, hence the {real, imag} = {-1, 0} pair above.
+    double cpu_norm = xlanhe(&norm_type, &uplo, &N, hCPU, &lda, work);
+    xaxpy(&size, alpha, hCPU, &incx, hGPU, &incx);
+    double error = xlanhe(&norm_type, &uplo, &N, hGPU, &lda, work) / cpu_norm;
+
+    return error;
+}
+
+template <>
+inline double norm_check_symmetric<rocblas_half, 0>(char          norm_type,
+                                                    char          uplo,
+                                                    rocblas_int   N,
+                                                    rocblas_int   lda,
+                                                    rocblas_half* hCPU,
+                                                    rocblas_half* hGPU)
+{
+    host_vector<double> hCPU_double(N * lda);
+    host_vector<double> hGPU_double(N * lda);
+    // rocblas_half specialization: convert half -> float -> double over the N * lda entries
+    for(rocblas_int i = 0; i < N * lda; i++)
+    {
+        hCPU_double[i] = double(half_to_float(hCPU[i]));
+        hGPU_double[i] = double(half_to_float(hGPU[i]));
+    }
+    // delegate to norm_check_symmetric on the double buffers
+    return norm_check_symmetric(norm_type, uplo, N, lda, hCPU_double.data(), hGPU_double.data());
+}
 
 #endif

From d7252e40980599749d9879c493929d614397dab0 Mon Sep 17 00:00:00 2001
From: jzuniga-amd <juan.zuniga-anaya@amd.com>
Date: Wed, 14 Aug 2019 09:20:13 -0600
Subject: [PATCH 4/5] clang formating

---
 clients/include/norm.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/clients/include/norm.hpp b/clients/include/norm.hpp
index b9f47fb2e..e5af1928a 100644
--- a/clients/include/norm.hpp
+++ b/clients/include/norm.hpp
@@ -36,7 +36,8 @@ double zlange_(char* norm_type, int* m, int* n, rocblas_double_complex* A, int*
 float  slansy_(char* norm_type, char* uplo, int* n, float* A, int* lda, float* work);
 double dlansy_(char* norm_type, char* uplo, int* n, double* A, int* lda, double* work);
 float clanhe_(char* norm_type, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* work);
-double zlanhe_(char* norm_type, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* work);
+double
+    zlanhe_(char* norm_type, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* work);
 
 void saxpy_(int* n, float* alpha, float* x, int* incx, float* y, int* incy);
 void daxpy_(int* n, double* alpha, double* x, int* incx, double* y, int* incy);
@@ -121,7 +122,6 @@ inline void xaxpy(int*                    n,
     return zaxpy_(n, alpha, x, incx, y, incy);
 }
 
-
 /* ============== Norm Check for General Matrix ============= */
 /*! \brief compare the norm error of two matrices hCPU & hGPU */
 template <typename T, typename std::enable_if<!is_complex<T>, int>::type = 0>
@@ -242,13 +242,13 @@ inline double norm_check_general(char        norm_type,
 
 /* ============== Norm Check for batched case ============= */
 template <typename T>
-inline double norm_check_general(char        norm_type,
-                                 rocblas_int M,
-                                 rocblas_int N,
-                                 rocblas_int lda,
-                                 rocblas_int batch_count,
-                                 host_vector<T>   hCPU[],
-                                 host_vector<T>   hGPU[])
+inline double norm_check_general(char           norm_type,
+                                 rocblas_int    M,
+                                 rocblas_int    N,
+                                 rocblas_int    lda,
+                                 rocblas_int    batch_count,
+                                 host_vector<T> hCPU[],
+                                 host_vector<T> hGPU[])
 {
     // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm
     // one norm is max column sum

From 73094bd8b121c1242257427e111ac39a8a239f1b Mon Sep 17 00:00:00 2001
From: jzuniga-amd <juan.zuniga-anaya@amd.com>
Date: Wed, 14 Aug 2019 13:28:03 -0600
Subject: [PATCH 5/5] Correct bugs in gemv complex

---
 clients/include/norm.hpp           | 2 +-
 library/src/blas2/rocblas_gemv.hpp | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clients/include/norm.hpp b/clients/include/norm.hpp
index e5af1928a..714037152 100644
--- a/clients/include/norm.hpp
+++ b/clients/include/norm.hpp
@@ -264,7 +264,7 @@ inline double norm_check_general(char           norm_type,
     {
         auto index = i;
 
-        auto error = norm_check_general(norm_type, M, N, lda, hCPU[index], hGPU[index]);
+        auto error = norm_check_general<T>(norm_type, M, N, lda, hCPU[index], hGPU[index]);
 
         if(norm_type == 'F' || norm_type == 'f')
         {
diff --git a/library/src/blas2/rocblas_gemv.hpp b/library/src/blas2/rocblas_gemv.hpp
index 4c3a9c57b..a3b6561eb 100644
--- a/library/src/blas2/rocblas_gemv.hpp
+++ b/library/src/blas2/rocblas_gemv.hpp
@@ -222,6 +222,8 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
         static constexpr int GEMVN_DIM_X = 64;
         static constexpr int GEMVN_DIM_Y = 16;
         rocblas_int          blocks      = (m - 1) / (GEMVN_DIM_X * 4) + 1;
+        if(std::is_same<T, rocblas_double_complex>{})
+            blocks = (m - 1) / (GEMVN_DIM_X) + 1;
 
         dim3 gemvn_grid(blocks, batch_count);
         dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
@@ -319,7 +321,7 @@ rocblas_status rocblas_gemv_batched_template(rocblas_handle    handle,
         // conjugate transpose
         // number of columns on the y-dim of the grid
         static constexpr int NB = 256;
-        dim3                 gemvc_grid(n, 1);
+        dim3                 gemvc_grid(n, batch_count);
         dim3                 gemvc_threads(NB);
 
         if(handle->pointer_mode == rocblas_pointer_mode_device)
@@ -396,6 +398,8 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
         static constexpr int GEMVN_DIM_X = 64;
         static constexpr int GEMVN_DIM_Y = 16;
         rocblas_int          blocks      = (m - 1) / (GEMVN_DIM_X * 4) + 1;
+        if(std::is_same<T, rocblas_double_complex>{})
+            blocks = (m - 1) / (GEMVN_DIM_X) + 1;
 
         dim3 gemvn_grid(blocks, batch_count);
         dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
@@ -505,7 +509,7 @@ rocblas_status rocblas_gemv_strided_batched_template(rocblas_handle    handle,
         // conjugate transpose
         // number of columns on the y-dim of the grid
         static constexpr int NB = 256;
-        dim3                 gemvc_grid(n, 1);
+        dim3                 gemvc_grid(n, batch_count);
         dim3                 gemvc_threads(NB);
 
         if(handle->pointer_mode == rocblas_pointer_mode_device)