From 6fe9ba67cf41fc5751bc2230b3a9b3234900eebc Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 8 Oct 2015 14:16:58 -0500 Subject: [PATCH 1/4] Update and rename clBLAS.h to aBLAS.h --- src/include/{clBLAS.h => aBLAS.h} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename src/include/{clBLAS.h => aBLAS.h} (97%) diff --git a/src/include/clBLAS.h b/src/include/aBLAS.h similarity index 97% rename from src/include/clBLAS.h rename to src/include/aBLAS.h index 974f69621..6eeaa3ff7 100644 --- a/src/include/clBLAS.h +++ b/src/include/aBLAS.h @@ -25,8 +25,8 @@ */ #pragma once -#ifndef _CL_BLAS_H_ -#define _CL_BLAS_H_ +#ifndef _aBLAS_H_ +#define _aBLAS_H_ #include @@ -103,7 +103,7 @@ CLBLAS_EXPORT clblasStatus * \param[in] control clBLAS state object */ CLBLAS_EXPORT clblasStatus - clblasGemm( const clblasScalar* alpha, + clblasGemm(const clblasScalar* alpha, const clblasMatrix* a, const clblasMatrix* b, const clblasScalar* beta, @@ -141,4 +141,4 @@ CLBLAS_EXPORT clblasStatus } // extern C #endif -#endif // _CL_BLAS_H_ +#endif // _aBLAS_H_ From 622881908057fb7f2b40b0a0c2b3ec18c03ddcf1 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 8 Oct 2015 14:20:49 -0500 Subject: [PATCH 2/4] Rename clBLAS-types.h to ablas_types.h --- src/include/{clBLAS-types.h => ablas_types.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/include/{clBLAS-types.h => ablas_types.h} (100%) diff --git a/src/include/clBLAS-types.h b/src/include/ablas_types.h similarity index 100% rename from src/include/clBLAS-types.h rename to src/include/ablas_types.h From 3a625fb8774eb4f8f00bc536c13d824c435ca725 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 8 Oct 2015 14:21:16 -0500 Subject: [PATCH 3/4] Rename aBLAS.h to ablas.h --- src/include/{aBLAS.h => ablas.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/include/{aBLAS.h => ablas.h} (100%) diff --git a/src/include/aBLAS.h b/src/include/ablas.h similarity index 100% rename from src/include/aBLAS.h 
rename to src/include/ablas.h From 9f7dcff581b8d031171656e341c7a9a8962c375a Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 8 Oct 2015 15:50:09 -0500 Subject: [PATCH 4/4] defining matrix and gemm --- src/include/ablas.h | 115 +++++++++++++++++------- src/include/ablas_types.h | 184 ++++++++++++-------------------------- 2 files changed, 141 insertions(+), 158 deletions(-) diff --git a/src/include/ablas.h b/src/include/ablas.h index 6eeaa3ff7..21949ce81 100644 --- a/src/include/ablas.h +++ b/src/include/ablas.h @@ -15,7 +15,7 @@ * ************************************************************************ */ /*! \file - * \brief clBLAS.h defines 'C' compatible callable functions and types that + * \brief ablas.h defines 'C' compatible callable functions and types that * call into the library * \details The minimum compiler versions the library should support * ( These compilers have solid C++11 support): @@ -25,30 +25,30 @@ */ #pragma once -#ifndef _aBLAS_H_ -#define _aBLAS_H_ +#ifndef _ABLAS_H_ +#define _ABLAS_H_ #include /*! * CMake-generated file to define export related preprocessor macros, including - * CLBLAS_EXPORT and CLBLAS_DEPRECATED + * ABLAS_EXPORT and ABLAS_DEPRECATED */ -#include "clblas_export.h" +#include "ablas_export.h" #ifdef __cplusplus extern "C" { #endif -#include "clBLAS-types.h" +#include "ablas_types.h" -/*! Define CLBLAS_USE_OPENCL to build library for OpenCL +/*! Define ABLAS_USE_OPENCL to build library for OpenCL */ -#if defined( CLBLAS_USE_OPENCL ) - #include "clBLAS-opencl.h" +#if defined( ABLAS_USE_OPENCL ) + #include "ablas_opencl.h" #else // Boltzman headers to be included here - #include "clBLAS-hsa.h" + #include "ablas_hsa.h" #endif /*! @@ -65,17 +65,17 @@ extern "C" { /**@{*/ /*! 
-* \brief Enable/Disable asynchronous behavior for clBLAS +* \brief Enable/Disable asynchronous behavior for ablas * -* \param[in] control A valid clsparseControl created with clblasCreateControl +* \param[in] control A valid clsparseControl created with ablasCreateControl * \param[in] async True to enable immediate return, false to block execution until event completion * * \ingroup STATE-SINGLE * -* \returns \b clblasSuccess +* \returns \b ablasSuccess */ -CLBLAS_EXPORT clblasStatus - clblasEnableAsync( clblasControl control, bool async ); +ABLAS_EXPORT ablas_status +ablas_enable_async( ablas_control control, bool async ); /**@}*/ /*! @@ -92,31 +92,80 @@ CLBLAS_EXPORT clblasStatus /**@{*/ -/*! \brief Refactored clBLAS API - * \details These pointers are not denoting arrays. The batch processing is specified inside of these - * structs with batch_size, \f$ C \leftarrow \alpha \ast A \ast B + \beta \ast C \f$ +/*! \brief Refactored ablas API + * \details Generic matrix-matrix multiplication. These pointers are not denoting arrays. The batch processing is specified inside of these + * structs with batch_size + * \f$ c \leftarrow \alpha o (a \ast b) + \beta o c \f$ + * + * operator 'o' represent the entrywise (Hadamard) product. + * scalar o scalar + * scalar o vector + * scalar o matrix + * vector o vector + * vector o matrix + * matrix o matrix + * + * The general equation can be simplified by the terms being either ZERO or IDENTITY. 
+ * + * GEMM (L3) + * alpha - scalar, vector or matrix + * a - matrix + * b - matrix + * beta - scalar, vector or matrix + * c - matrix + * + * GEMV (L2) + * alpha - scalar or vector + * a - matrix + * b - vector + * beta - scalar or vector + * c - vector + * + * AXPY (L1) + * alpha - scalar or vector + * a - vector + * b - IDENTITY + * beta - ZERO + * c - vector + * + * SDOT (L1) + * alpha - IDENTITY + * a - vector + * b - vector + * beta - ZERO + * c - scalar + * + * SCAL (L1) + * alpha - ZERO + * a - ZERO + * b - ZERO + * beta - scalar, vector or matrix + * c - scalar, vector or matrix + * + * * \param[in] alpha Scalar value to be multiplied into the product of A * B * \param[in] a Source matrix * \param[in] b Source matrix * \param[in] beta Scalar value to be multiplied into the matrix C on read * \param[in,out] c Destination matrix - * \param[in] control clBLAS state object + * \param[in,out] control ablas state object */ -CLBLAS_EXPORT clblasStatus - clblasGemm(const clblasScalar* alpha, - const clblasMatrix* a, - const clblasMatrix* b, - const clblasScalar* beta, - clblasMatrix* c, - clblasControl control ); +ABLAS_EXPORT ablas_status +ablas_gemm( + const ablas_matrix *alpha, + const ablas_matrix *a, + const ablas_matrix *b, + const ablas_matrix *beta, + ablas_matrix *c, + ablas_control *control ); /**@}*/ -// Example of older clBLAS API from v2.x.x -// CLBLAS_DEPRECATED clblasStatus -// clblasSgemm( -// clblasOrder order, -// clblasTranspose transA, -// clblasTranspose transB, +// Example of older ablas API from v2.x.x +// ABLAS_DEPRECATED ablasStatus +// ablasSgemm( +// ablasOrder order, +// ablasTranspose transA, +// ablasTranspose transB, // size_t M, // size_t N, // size_t K, @@ -141,4 +190,4 @@ CLBLAS_EXPORT clblasStatus } // extern C #endif -#endif // _aBLAS_H_ +#endif // _ABLAS_H_ diff --git a/src/include/ablas_types.h b/src/include/ablas_types.h index 0114defc0..abb73f190 100644 --- a/src/include/ablas_types.h +++ b/src/include/ablas_types.h @@ 
-15,200 +15,134 @@ * ************************************************************************ */ /*! \file - * \brief clBLAS-types.h defines public types to be consummed by the library + * \brief ablas_types.h defines public types to be consumed by the library * The types are agnostic to the underlying runtime used by the library */ #pragma once -#ifndef _CL_BLAS_TYPES_H_ -#define _CL_BLAS_TYPES_H_ +#ifndef _ABLAS_TYPES_H_ +#define _ABLAS_TYPES_H_ /*! \brief An enumeration to describe the precision of data pointed by a * particular instance of a struct - * \remarks This impllies that clBLAS can support mixed precision operations + * \remarks This implies that aBLAS can support mixed precision operations */ -typedef enum clblasPrecision_ { - clblasSingleReal, - clblasDoubleReal, - clblasSingleComplex, - clblasDoubleComplex, -} clblasPrecision; +typedef enum ablas_precision_ { + ablas_single_real, + ablas_double_real, + ablas_single_complex, + ablas_double_complex, +} ablas_precision; /*! \brief Used by the Hermitian, symmetric and triangular matrix * routines to specify whether the upper or lower triangle is being referenced. */ -typedef enum clblasUplo_ { - clblasUpper, /**< Upper triangle. */ - clblasLower /**< Lower triangle. */ -} clblasUplo; +typedef enum ablas_uplo_ { + ablas_upper, /**< Upper triangle. */ + ablas_lower /**< Lower triangle. */ +} ablas_uplo; /*! \brief It is used by the triangular matrix routines to specify whether the * matrix is unit triangular. */ -typedef enum clblasDiag_ { - clblasUnit, /**< Unit triangular. */ - clblasNonUnit /**< Non-unit triangular. */ -} clblasDiag; +typedef enum ablas_diag_ { + ablas_unit, /**< Unit triangular. */ + ablas_non_unit /**< Non-unit triangular. */ +} ablas_diag; /*! \brief Indicates the side matrix A is located relative to matrix B during multiplication.
*/ -typedef enum clblasSide_ { - clblasLeft, /**< Multiply general matrix by symmetric, +typedef enum ablas_side_ { + ablas_left, /**< Multiply general matrix by symmetric, Hermitian or triangular matrix on the left. */ - clblasRight /**< Multiply general matrix by symmetric, + ablas_right /**< Multiply general matrix by symmetric, Hermitian or triangular matrix on the right. */ -} clblasSide; +} ablas_side; -/*! \brief Structure to encapsulate scalar data to clBLAS API - * \details This stores data in a struct of arrays (SoA) model. This should help performance - * for batched operation, and gracefully become a 'normal' struct when batch_size == 1 - * \note It is the users responsibility to allocate/deallocate OpenCL buffers - */ -typedef struct clblasScalar_ -{ - /*! Polymorphic pointer for the library. If clBLAS is compiled with BUILD_CLVERSION < 200, - * value will be will be treated as allocated with clCreateBuffer(). If - * BUILD_CLVERSION >= 200 then this will be treated as allocated with clSVMalloc() - */ - void* device_scalar; - - /*! This describes the precision of the data pointed to by value - */ - clblasPrecision precision; - - /*! This offset is added to the cl_mem locations on device to define beginning of the data in the cl_mem buffers - */ - size_t offset; - - /*! This is the number of scalar values stored in the value buffer - */ - size_t batch_size; - - /*! This is the distance between scalars in value; batch_stride >= 1 - * Packed scalars would have a batch_stride of 1 - */ - size_t batch_stride; -} clblasScalar; - -/*! \brief Structure to encapsulate dense vector data to clBLAS API - * \details This stores data in a struct of arrays (SoA) model. This should help performance - * for batched operation, and gracefully become a 'normal' struct when batch_size == 1 - * \note It is the users responsibility to allocate/deallocate OpenCL buffers - */ -typedef struct clblasVector_ -{ - /*! Polymorphic pointer for the library. 
If clBLAS is compiled with BUILD_CLVERSION < 200, - * value will be will be treated as a pointer allocated with clCreateBuffer(). If - * BUILD_CLVERSION >= 200 then this will be treated as a pointer allocated with clSVMalloc() - */ - void* device_vector; - - /*! This describes the precision of the data pointed to by value - */ - clblasPrecision precision; - - /*! \brief This offset is added to the cl_mem location on device to define beginning of the data in the cl_mem buffers - * \details Usually used to define the start a smaller subvector in larger vector allocation block. - * This same offset is applied to every vector in a batch - */ - size_t offset; - - size_t length; /*!< Length of an individual vector */ - - /*! \brief Stride to consecutive elements in vector - * \details For packed vectors, stride == 1 - */ - size_t stride; - - /*! This is the number of vectors stored in the values buffer; a single vector would have batch_size == 1 - */ - size_t batch_size; - - /*! This is the distance between vectors in values; batch_stride >= num_values - * Packed vectors would have a batch_stride == num_values - */ - size_t batch_stride; -} clblasVector; - -/*! \brief Structure to encapsulate dense matrix data to clBLAS API - * \details This stores data in a struct of arrays (SoA) model. This should help performance - * for batched operation, and gracefully become a 'normal' struct when batch_size == 1 +/*! \brief Structure to encapsulate dense matrix/vector/scalar data to aBLAS API. + * \details Able to store multiple matrices (or vectors, scalars) + * to facilitate high-performance batched operations; + * gracefully becomes a 'normal' matrix when num_matrices == 1.
* \verbatim - clBlas V2: Given a column major matrix with M, N, lda, nontranspose - This matrix is represented in clBlas V3 as: + clBLAS V2: Given a column major matrix with M, N, lda, nontranspose + This matrix is represented in aBLAS as: num_rows = M num_cols = N row_stride = 1 col_stride = ldX + num_matrices = 1 \endverbatim - - * \note It is the users responsibility to allocate/deallocate OpenCL buffers + * aBLAS API represents scalars as ablas_matrix with num_rows = 1 and num_cols = 1. + * aBLAS API represents vectors as ablas_matrix with num_rows = 1 or num_cols = 1. + * + * \note It is the users responsibility to allocate/deallocate buffers * \note Traditional matrix fields not explicitely represented within this structure * \li \b transpose layout * \li \b row/column major layout * \attention There has been significant debate about changing the matrix meta data below from host scalar values - * into batched scalar values by changing their types to clblasScalar. The advantage is that we + * into batched scalar values by changing their types to ablasScalar. The advantage is that we * could then process batched matricies of arbitrary row, column and stride values. The problem is * that both host and device need access to this data, which would introduce mapping calls. The host needs * the data to figure out how to form launch parameters, and the device needs access to be able to * handle matrix tail cases properly. This may have reasonable performance on APU's, but the performance * impact on discrete devices could be significant. For now, we keep the num_rows, num_cols and strides as a size_t on host */ -typedef struct clblasMatrix_ -{ - /*! \brief An OpenCL buffer that holds the values of all the matrix data - * \details Polymorphic pointer for the library. If clBLAS is compiled with BUILD_CLVERSION < 200, +typedef struct ablas_matrix_ { + + /*! \brief Buffer that holds the matrix data. + * \details Polymorphic pointer for the library. 
If aBLAS is compiled with BUILD_CLVERSION < 200, * value will be will be treated as a pointer allocated with clCreateBuffer(). If * BUILD_CLVERSION >= 200 then this will be treated as a pointer allocated with clSVMalloc() * For batched matrices, this buffer contains the packed values of all the matrices. */ - void* device_matrix; + void* data; - /*! This describes the precision of the data pointed to by value + /*! Precision of the data. */ - clblasPrecision precision; + ablas_precision precision; /*! \brief This offset is added to the cl_mem location on device to define beginning of the data in the cl_mem buffers * \details Usually used to define the start a smaller submatrix in larger matrix allocation block. * This same offset is applied to every matrix in a batch */ - size_t offset_values; + size_t offset; - /*! \brief Number of elements in a matrix row + /*! \brief Number of rows in each matrix. * \details For batched matrices, this is a constant property of each 'matrix', where each matrix has the same number * of rows */ size_t num_rows; - /*! \brief Number of elements in a matrix column + /*! \brief Number of columns in each matrix. * \details For batched matrices, this is a constant property of each 'matrix', where each matrix has the same number * of columns */ size_t num_cols; - /*! \brief Stride to consecutive elements in a matrix row - * \details For an example of a packed matrices, in 'C' family row major - * languages this is num_rows, for Fortran family languages this is 1 + /*! Number of matrices stored in the buffer; a single matrix would have num_matrices == 1. + * \pre num_matrices > 0 + */ + size_t num_matrices; + + /*! \brief Stride to consecutive rows in each matrix. + * \details ptr += row_stride would point to same column, same matrix, next row. + * For column-major matrix, row_stride = 1. */ size_t row_stride; - /*! 
\brief Stride to consecutive elements in a matrix column - * \details For an example of a packed matrices, in 'C' family row major - * languages this is 1, for Fortran family languages this is num_rows + /*! \brief Stride to consecutive columns in each matrix. + * \details ptr += col_stride would point to same row, same matrix, next column. + * For row-major matrix, col_stride = 1. */ size_t col_stride; - /*! This is the number of vectors stored in the values buffer; a single vector would have batch_size == 1 - * \pre batch_size > 0 - */ - size_t batch_size; - - /*! This is the distance between the start of matrices in values + /*! \brief Stride to consecutive matrices. + * \details ptr += matrix_stride would point to same row, same column, next matrix * \pre row_major: batch_stride >= num_rows * row_stride * \pre column_major: batch_stride >= num_cols * col_stride */ - size_t batch_stride; + size_t matrix_stride; -} clblasMatrix; +} ablas_matrix; #endif