diff --git a/README.md b/README.md
index 4f26d94b4..c8b704924 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,10 @@ oneMKL interfaces is an open-source implementation of oneMKL Data Parallel C++ (
Intel(R) oneAPI Math Kernel Library for Intel GPU |
Intel GPU |
+
+ | NVIDIA cuBLAS for NVIDIA GPU |
+ NVIDIA GPU |
+
@@ -81,13 +85,13 @@ cl::sycl::queue cpu_queue(cpu_dev);
cl::sycl::queue gpu_queue(gpu_dev);
onemkl::blas::gemm(cpu_queue, transA, transB, m, ...);
-onemkl::blas::gemm(gpu_queue, transA, transB, m, ...);
+onemkl::blas::gemm(gpu_queue, transA, transB, m, ...);
```
How to build an application with run-time dispatching:
```cmd
$> clang++ -fsycl –I$ONEMKL/include app.cpp
-$> clang++ -fsycl app.o –L$ONEMKL/lib –lonemkl_blas_mklcpu –lonemkl_blas_mklgpu
+$> clang++ -fsycl app.o –L$ONEMKL/lib –lonemkl_blas_mklcpu –lonemkl_blas_cublas
```
### Supported Configurations:
@@ -100,6 +104,7 @@ Supported domains: BLAS
:------| :-------| :------------------
Intel CPU | Intel(R) oneAPI Math Kernel Library | Dynamic, Static
Intel GPU | Intel(R) oneAPI Math Kernel Library | Dynamic, Static
+ NVIDIA GPU | NVIDIA cuBLAS | Dynamic, Static
---
@@ -114,18 +119,19 @@ Supported domains: BLAS
- Intel(R) Xeon(R) Processor Family
- Accelerators
- Intel(R) Processor Graphics GEN9
+ - NVIDIA(R) TITAN RTX(TM) (Not tested with other NVIDIA GPU families and products.)
---
### Supported Operating Systems
#### Linux*
-Operating System | CPU Host/Target | Integrated Graphics from Intel (Intel GPU)
-:--- | :--- | :---
-Ubuntu | 18.04.3, 19.04 | 18.04.3, 19.10
-SUSE Linux Enterprise Server* | 15 | *Not supported*
-Red Hat Enterprise Linux* (RHEL*) | 8 | *Not supported*
-Linux* kernel | *N/A* | 4.11 or higher
+Operating System | CPU Host/Target | Integrated Graphics from Intel (Intel GPU) | NVIDIA GPU
+:--- | :--- | :--- | :---
+Ubuntu | 18.04.3, 19.04 | 18.04.3, 19.10 | 18.04.3
+SUSE Linux Enterprise Server* | 15 | *Not supported* | *Not supported*
+Red Hat Enterprise Linux* (RHEL*) | 8 | *Not supported* | *Not supported*
+Linux* kernel | *N/A* | 4.11 or higher | *N/A*
---
@@ -174,7 +180,7 @@ Linux* kernel | *N/A* | 4.11 or higher
- | Linux* |
+ Linux* |
Any |
GNU* GCC 5.1 or higher |
@@ -192,6 +198,11 @@ Linux* kernel | *N/A* | 4.11 or higher
| Intel(R) oneAPI Math Kernel Library |
+ NVIDIA GPU |
+ Intel project for LLVM* technology |
+
+ | NVIDIA CUDA SDK |
+
@@ -206,7 +217,9 @@ Python | 3.6 or higher | [PSF](https://docs.python.org/3.6/license.html)
[GNU* FORTRAN Compiler](https://gcc.gnu.org/wiki/GFortran) | 7.4.0 or higher | [GNU General Public License, version 3](https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gfortran/Copying.html)
[Intel(R) oneAPI DPC++ Compiler](https://software.intel.com/en-us/oneapi/dpc-compiler) | 2021.1-beta05 | [End User License Agreement for the Intel(R) Software Development Products](https://software.intel.com/en-us/license/eula-for-intel-software-development-products)
[Intel project for LLVM* technology binary for Intel CPU](https://github.com/intel/llvm/releases) | Daily builds (experimental) tested with [20200331](https://github.com/intel/llvm/releases/download/20200331/dpcpp-compiler.tar.gz) | [Apache License v2](https://github.com/intel/llvm/blob/sycl/sycl/LICENSE.TXT)
+[Intel project for LLVM* technology source for NVIDIA GPU](https://github.com/intel/llvm/releases) | Daily source releases: tested with [20200421](https://github.com/intel/llvm/tree/20200421) | [Apache License v2](https://github.com/intel/llvm/blob/sycl/sycl/LICENSE.TXT)
[Intel(R) oneAPI Math Kernel Library](https://software.intel.com/en-us/oneapi/onemkl) | 2021.1-beta05 | [Intel Simplified Software License](https://software.intel.com/en-us/license/intel-simplified-software-license)
+[NVIDIA CUDA SDK](https://developer.nvidia.com/cublas) | 10.2 | [End User License Agreement](https://docs.nvidia.com/cuda/eula/index.html)
[NETLIB LAPACK](https://github.com/Reference-LAPACK/lapack) | 3.7.1 | [BSD like license](http://www.netlib.org/lapack/LICENSE.txt)
[Sphinx](https://www.sphinx-doc.org/en/master/) | 2.4.4 | [BSD License](https://github.com/sphinx-doc/sphinx/blob/3.x/LICENSE)
@@ -248,6 +261,7 @@ You can specify build options using `-D=`. The following ta
CMake Option | Supported Values | Default Value
:----------- | :--------------- | :---
BUILD_SHARED_LIBS | True, False | True
+ENABLE_CUBLAS_BACKEND | True, False | False
ENABLE_MKLCPU_BACKEND | True, False | True
ENABLE_MKLGPU_BACKEND | True, False | True
ENABLE_MKLCPU_THREAD_TBB | True, False | True
diff --git a/cmake/FindcuBLAS.cmake b/cmake/FindcuBLAS.cmake
new file mode 100644
index 000000000..06fe6fe59
--- /dev/null
+++ b/cmake/FindcuBLAS.cmake
@@ -0,0 +1,55 @@
+#==========================================================================
+# Copyright (C) Codeplay Software Limited
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# For your convenience, a copy of the License has been included in this
+# repository.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#=========================================================================
+
+find_package(CUDA 10.0 REQUIRED)
+find_path(CUBLAS_INCLUDE_DIR "cublas_v2.h" HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+get_filename_component(SYCL_BINARY_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
+# the OpenCL include file from cuda is opencl 1.1 and it is not compatible with DPC++
+# the OpenCL include headers 1.2 onward is required. This is used to bypass NVIDIA OpenCL headers
+find_path(OPENCL_INCLUDE_DIR CL/cl.h OpenCL/cl.h
+HINTS
+${OPENCL_INCLUDE_DIR}
+${SYCL_BINARY_DIR}/../include/sycl/
+)
+find_library(CUBLAS_LIBRARY cublas)
+find_library(CUDA_DRIVER_LIBRARY cuda)
+# this is work around to avoid duplication half creation in both cuda and SYCL
+add_compile_definitions(CUDA_NO_HALF)
+
+find_package(Threads REQUIRED)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(cuBLAS
+ REQUIRED_VARS
+ CUBLAS_INCLUDE_DIR
+ CUDA_INCLUDE_DIRS
+ CUBLAS_LIBRARY
+ CUDA_LIBRARIES
+ CUDA_DRIVER_LIBRARY
+ OPENCL_INCLUDE_DIR
+)
+if(NOT TARGET ONEMKL::cuBLAS::cuBLAS)
+ add_library(ONEMKL::cuBLAS::cuBLAS SHARED IMPORTED)
+ set_target_properties(ONEMKL::cuBLAS::cuBLAS PROPERTIES
+ IMPORTED_LOCATION ${CUBLAS_LIBRARY}
+ INTERFACE_INCLUDE_DIRECTORIES "${OPENCL_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}"
+ INTERFACE_LINK_LIBRARIES "Threads::Threads;${CUDA_DRIVER_LIBRARY};${CUDA_LIBRARIES}"
+ )
+
+endif()
diff --git a/include/onemkl/blas/blas.hpp b/include/onemkl/blas/blas.hpp
index 6dd01af6c..d705048ad 100644
--- a/include/onemkl/blas/blas.hpp
+++ b/include/onemkl/blas/blas.hpp
@@ -31,6 +31,7 @@
#include "onemkl/blas/predicates.hpp"
#include "onemkl/blas/detail/blas_loader.hpp"
+#include "onemkl/blas/detail/cublas/blas_ct.hpp"
#include "onemkl/blas/detail/mklcpu/blas_ct.hpp"
#include "onemkl/blas/detail/mklgpu/blas_ct.hpp"
diff --git a/include/onemkl/blas/detail/cublas/blas_ct.hpp b/include/onemkl/blas/detail/cublas/blas_ct.hpp
new file mode 100644
index 000000000..4ac19b7f8
--- /dev/null
+++ b/include/onemkl/blas/detail/cublas/blas_ct.hpp
@@ -0,0 +1,3022 @@
+/***************************************************************************
+* Copyright (C) Codeplay Software Limited
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* For your convenience, a copy of the License has been included in this
+* repository.
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+**************************************************************************/
+//
+// Generated based on onemkl/blas/blas.hpp
+//
+
+#ifndef _DETAIL_CUBLAS_BLAS_HPP_
+#define _DETAIL_CUBLAS_BLAS_HPP_
+
+#include
+#include
+#include
+
+#include "onemkl/detail/backends.hpp"
+#include "onemkl/detail/libraries.hpp"
+#include "onemkl/types.hpp"
+
+#include "onemkl_blas_cublas.hpp"
+
+namespace onemkl {
+namespace blas {
+
+template
+static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy,
+ cl::sycl::buffer &a, std::int64_t lda);
+template <>
+void syr2(cl::sycl::queue &queue, uplo upper_lower,
+ std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy,
+ cl::sycl::buffer &a, std::int64_t lda) {
+ syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ onemkl::cublas::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+}
+
+template
+static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy,
+ cl::sycl::buffer &a, std::int64_t lda);
+template <>
+void syr2(cl::sycl::queue &queue, uplo upper_lower,
+ std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy,
+ cl::sycl::buffer &a, std::int64_t lda) {
+ syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ onemkl::cublas::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+}
+
+template
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx);
+template <>
+void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx) {
+ scal_precondition(queue, n, alpha, x, incx);
+ onemkl::cublas::scal(queue, n, alpha, x, incx);
+ scal_postcondition(queue, n, alpha, x, incx);
+}
+
+template
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx);
+template <>
+void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx) {
+ scal_precondition(queue, n, alpha, x, incx);
+ onemkl::cublas::scal(queue, n, alpha, x, incx);
+ scal_postcondition(queue, n, alpha, x, incx);
+}
+
+template
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx);
+template <>
+void scal(cl::sycl::queue &queue, std::int64_t n,
+ std::complex alpha,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ scal_precondition(queue, n, alpha, x, incx);
+ onemkl::cublas::scal(queue, n, alpha, x, incx);
+ scal_postcondition(queue, n, alpha, x, incx);
+}
+
+template
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx);
+template <>
+void scal(cl::sycl::queue &queue, std::int64_t n,
+ std::complex alpha,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ scal_precondition(queue, n, alpha, x, incx);
+ onemkl::cublas::scal(queue, n, alpha, x, incx);
+ scal_postcondition(queue, n, alpha, x, incx);
+}
+
+template
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx);
+template <>
+void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ scal_precondition(queue, n, alpha, x, incx);
+ onemkl::cublas::scal(queue, n, alpha, x, incx);
+ scal_postcondition(queue, n, alpha, x, incx);
+}
+
+template
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx);
+template <>
+void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ scal_precondition(queue, n, alpha, x, incx);
+ onemkl::cublas::scal(queue, n, alpha, x, incx);
+ scal_postcondition(queue, n, alpha, x, incx);
+}
+
+template
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer &a, std::int64_t lda,
+ cl::sycl::buffer &x, std::int64_t incx);
+template <>
+void trmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer &a, std::int64_t lda,
+ cl::sycl::buffer &x, std::int64_t incx) {
+ trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+}
+
+template
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer &a, std::int64_t lda,
+ cl::sycl::buffer &x, std::int64_t incx);
+template <>
+void trmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer &a, std::int64_t lda,
+ cl::sycl::buffer &x, std::int64_t incx) {
+ trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+}
+
+template
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer, 1> &a,
+ std::int64_t lda, cl::sycl::buffer, 1> &x,
+ std::int64_t incx);
+template <>
+void trmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer, 1> &a,
+ std::int64_t lda,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+}
+
+template
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer, 1> &a,
+ std::int64_t lda, cl::sycl::buffer, 1> &x,
+ std::int64_t incx);
+template <>
+void trmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer, 1> &a,
+ std::int64_t lda,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+ trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
+}
+
+template
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer &a,
+ cl::sycl::buffer &x, std::int64_t incx);
+template <>
+void tpmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer &a,
+ cl::sycl::buffer &x, std::int64_t incx) {
+ tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+}
+
+template
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer &a,
+ cl::sycl::buffer &x, std::int64_t incx);
+template <>
+void tpmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer &a,
+ cl::sycl::buffer &x, std::int64_t incx) {
+ tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+}
+
+template
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x, std::int64_t incx);
+template <>
+void tpmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+}
+
+template
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+ std::int64_t n, cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x, std::int64_t incx);
+template <>
+void tpmv(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, diag unit_diag, std::int64_t n,
+ cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx) {
+ tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+ tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
+}
+
+template
+static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &a);
+template <>
+void spr(cl::sycl::queue &queue, uplo upper_lower,
+ std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &a) {
+ spr_precondition(queue, upper_lower, n, alpha, x, incx, a);
+ onemkl::cublas::spr(queue, upper_lower, n, alpha, x, incx, a);
+ spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
+}
+
+template
+static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &a);
+template <>
+void spr(cl::sycl::queue &queue, uplo upper_lower,
+ std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &a) {
+ spr_precondition(queue, upper_lower, n, alpha, x, incx, a);
+ onemkl::cublas::spr(queue, upper_lower, n, alpha, x, incx, a);
+ spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
+}
+
+template
+static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+ std::complex alpha, cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ std::complex beta, cl::sycl::buffer, 1> &y,
+ std::int64_t incy);
+template <>
+void hpmv(cl::sycl::queue &queue, uplo upper_lower,
+ std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx, std::complex beta,
+ cl::sycl::buffer, 1> &y,
+ std::int64_t incy) {
+ hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
+ onemkl::cublas::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
+ hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
+}
+
+template
+static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+ std::complex alpha, cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ std::complex beta, cl::sycl::buffer, 1> &y,
+ std::int64_t incy);
+template <>
+void hpmv(cl::sycl::queue &queue, uplo upper_lower,
+ std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &a,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx, std::complex beta,
+ cl::sycl::buffer, 1> &y,
+ std::int64_t incy) {
+ hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
+ onemkl::cublas::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
+ hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
+}
+
+template
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+ std::int64_t k, float alpha, cl::sycl::buffer &a,
+ std::int64_t lda, float beta, cl::sycl::buffer &c,
+ std::int64_t ldc);
+template <>
+void syrk(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, std::int64_t n, std::int64_t k,
+ float alpha, cl::sycl::buffer &a,
+ std::int64_t lda, float beta,
+ cl::sycl::buffer &c, std::int64_t ldc) {
+ syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+}
+
+template
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+ std::int64_t k, double alpha, cl::sycl::buffer &a,
+ std::int64_t lda, double beta, cl::sycl::buffer &c,
+ std::int64_t ldc);
+template <>
+void syrk(cl::sycl::queue &queue, uplo upper_lower,
+ transpose trans, std::int64_t n, std::int64_t k,
+ double alpha, cl::sycl::buffer &a,
+ std::int64_t lda, double beta,
+ cl::sycl::buffer &c, std::int64_t ldc) {
+ syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+}
+
+template
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+ std::int64_t k, std::complex alpha,
+ cl::sycl::buffer, 1> &a, std::int64_t lda,
+ std::complex beta, cl::sycl::buffer, 1> &c,
+ std::int64_t ldc);
+template <>
+void syrk(
+ cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+ std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda,
+ std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) {
+ syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+}
+
+template
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+ std::int64_t k, std::complex alpha,
+ cl::sycl::buffer, 1> &a, std::int64_t lda,
+ std::complex beta, cl::sycl::buffer, 1> &c,
+ std::int64_t ldc);
+template <>
+void syrk(
+ cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+ std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda,
+ std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) {
+ syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+ syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
+}
+
+template
+static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+ std::complex alpha, cl::sycl::buffer, 1> &x,
+ std::int64_t incx, cl::sycl::buffer, 1> &y,
+ std::int64_t incy, cl::sycl::buffer, 1> &a,
+ std::int64_t lda);
+template <>
+void her2(
+ cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ cl::sycl::buffer, 1> &y, std::int64_t incy,
+ cl::sycl::buffer, 1> &a, std::int64_t lda) {
+ her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ onemkl::cublas::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+}
+
+template
+static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+ std::complex alpha, cl::sycl::buffer, 1> &x,
+ std::int64_t incx, cl::sycl::buffer, 1> &y,
+ std::int64_t incy, cl::sycl::buffer, 1> &a,
+ std::int64_t lda);
+template <>
+void her2(
+ cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ cl::sycl::buffer, 1> &y, std::int64_t incy,
+ cl::sycl::buffer, 1> &a, std::int64_t lda) {
+ her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ onemkl::cublas::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+ her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
+}
+
+template
+static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+ std::complex alpha, cl::sycl::buffer, 1> &a,
+ std::int64_t lda, cl::sycl::buffer, 1> &x,
+ std::int64_t incx, std::complex beta,
+ cl::sycl::buffer, 1> &y, std::int64_t incy);
+template <>
+void hbmv(
+ cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+ std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda,
+ cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta,
+ cl::sycl::buffer, 1> &y, std::int64_t incy) {
+ hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
+ onemkl::cublas::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
+ hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
+}
+
+template
+static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+ std::complex alpha, cl::sycl::buffer, 1> &a,
+ std::int64_t lda, cl::sycl::buffer, 1> &x,
+ std::int64_t incx, std::complex beta,
+ cl::sycl::buffer, 1> &y, std::int64_t incy);
+template <>
+void hbmv(
+ cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+ std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda,
+ cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta,
+ cl::sycl::buffer, 1> &y, std::int64_t incy) {
+ hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
+ onemkl::cublas::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
+ hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
+}
+
+template
+static inline void rot(cl::sycl::queue &queue, std::int64_t n,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ cl::sycl::buffer, 1> &y, std::int64_t incy, float c,
+ float s);
+template <>
+void rot(cl::sycl::queue &queue, std::int64_t n,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx,
+ cl::sycl::buffer, 1> &y,
+ std::int64_t incy, float c, float s) {
+ rot_precondition(queue, n, x, incx, y, incy, c, s);
+ onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s);
+ rot_postcondition(queue, n, x, incx, y, incy, c, s);
+}
+
+template
+static inline void rot(cl::sycl::queue &queue, std::int64_t n,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ cl::sycl::buffer, 1> &y, std::int64_t incy, double c,
+ double s);
+template <>
+void rot(cl::sycl::queue &queue, std::int64_t n,
+ cl::sycl::buffer, 1> &x,
+ std::int64_t incx,
+ cl::sycl::buffer, 1> &y,
+ std::int64_t incy, double c, double s) {
+ rot_precondition(queue, n, x, incx, y, incy, c, s);
+ onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s);
+ rot_postcondition(queue, n, x, incx, y, incy, c, s);
+}
+
+template
+static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x,
+ std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, float c,
+ float s);
+template <>
+void rot(cl::sycl::queue &queue, std::int64_t n,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy,
+ float c, float s) {
+ rot_precondition(queue, n, x, incx, y, incy, c, s);
+ onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s);
+ rot_postcondition(queue, n, x, incx, y, incy, c, s);
+}
+
+template
+static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x,
+ std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy,
+ double c, double s);
+template <>
+void rot(cl::sycl::queue &queue, std::int64_t n,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy,
+ double c, double s) {
+ rot_precondition(queue, n, x, incx, y, incy, c, s);
+ onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s);
+ rot_postcondition(queue, n, x, incx, y, incy, c, s);
+}
+
+template
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy);
+template <>
+void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy) {
+ axpy_precondition(queue, n, alpha, x, incx, y, incy);
+ onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy);
+ axpy_postcondition(queue, n, alpha, x, incx, y, incy);
+}
+
+template
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy);
+template <>
+void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha,
+ cl::sycl::buffer &x, std::int64_t incx,
+ cl::sycl::buffer &y, std::int64_t incy) {
+ axpy_precondition(queue, n, alpha, x, incx, y, incy);
+ onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy);
+ axpy_postcondition(queue, n, alpha, x, incx, y, incy);
+}
+
+template
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha,
+ cl::sycl::buffer, 1> &x, std::int64_t incx,
+ cl::sycl::buffer, 1> &y, std::int64_t incy);
+template <>
+void axpy(cl::sycl::queue &queue, std::int64_t n,
+ std::complex