Skip to content

Commit

Permalink
Add small matrix optimization kernel interface.
Browse files Browse the repository at this point in the history
make SMALL_MATRIX_OPT=1
  • Loading branch information
xianyi committed Apr 28, 2020
1 parent 57549f5 commit aae6af9
Show file tree
Hide file tree
Showing 11 changed files with 340 additions and 1 deletion.
5 changes: 5 additions & 0 deletions Makefile.system
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,11 @@ else
ONLY_CBLAS = 0
endif

# Small matrix optimization (opt-in): building with `make SMALL_MATRIX_OPT=1`
# appends -DSMALL_MATRIX_OPT to CCOMMON_OPT so that sources guarded by
# `#ifdef SMALL_MATRIX_OPT` are compiled in.
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
endif

# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
Expand Down
6 changes: 6 additions & 0 deletions common_d.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,12 @@
#define DIMATCOPY_K_RT dimatcopy_k_rt
#define DGEADD_K dgeadd_k


#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn
#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt
#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn
#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt

#else

#define DAMAX_K gotoblas -> damax_k
Expand Down
12 changes: 12 additions & 0 deletions common_level3.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,18 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
#endif

#ifdef SMALL_MATRIX_OPT
/*
 * Small-matrix GEMM kernels, declared only when SMALL_MATRIX_OPT is defined.
 * One set for float (sgemm_*) and one for double (dgemm_*); all share the
 * argument order (m, n, k, A, lda, alpha, B, ldb, beta, C, ldc).
 * NOTE(review): the _nn/_nt/_tn/_tt suffix presumably encodes whether A and B
 * (in that order) are accessed transposed -- confirm against the kernel
 * sources selected in kernel/Makefile.L3.
 */
int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);

int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
#endif

int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
Expand Down
16 changes: 16 additions & 0 deletions common_macro.h
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,11 @@

#define GEADD_K DGEADD_K

#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT

#elif defined(HALF)

#define AMAX_K SAMAX_K
Expand Down Expand Up @@ -923,6 +928,11 @@

#define GEADD_K SGEADD_K

#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT

#endif

#else
Expand Down Expand Up @@ -1228,6 +1238,12 @@
#define IMATCOPY_K_RT SIMATCOPY_K_RT

#define GEADD_K SGEADD_K

#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT

#endif
#else
#ifdef XDOUBLE
Expand Down
5 changes: 5 additions & 0 deletions common_s.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,11 @@

#define SGEADD_K sgeadd_k

#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn
#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt
#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn
#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt

#else

#define SAMAX_K gotoblas -> samax_k
Expand Down
28 changes: 27 additions & 1 deletion interface/gemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,18 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
#endif
};

#if defined(SMALL_MATRIX_OPT) && !defined(COMPLEX)
/*
 * Dispatch table for the small-matrix GEMM kernels.
 *
 * Only real s/dgemm small matrix optimization is supported so far, so the
 * table is defined only for non-COMPLEX builds (the same condition that
 * guards its use).  Previously a COMPLEX or GEMM3M build with
 * SMALL_MATRIX_OPT produced an array with an empty initializer list, which
 * is not valid ISO C before C23 and left an unused zero-length object behind.
 *
 * The table is indexed with (transb << 2) | transa; slots with no kernel
 * are NULL.
 */
static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
	GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL,
	GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL,
};
#endif

#ifndef CBLAS

void NAME(char *TRANSA, char *TRANSB,
Expand Down Expand Up @@ -411,6 +423,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

FUNCTION_PROFILE_START();

MNK = (double) args.m * (double) args.n * (double) args.k;

#ifdef SMALL_MATRIX_OPT
#if !defined(COMPLEX)
//need to tune small matrices cases.
if(MNK <= 100.0*100.0*100.0){
(gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b,
args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc);
return;
}
#endif
#endif


buffer = (XFLOAT *)blas_memory_alloc(0);

sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
Expand All @@ -420,7 +446,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);

MNK = (double) args.m * (double) args.n * (double) args.k;

if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else
Expand Down
73 changes: 73 additions & 0 deletions kernel/Makefile.L3
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,19 @@ XBLASOBJS += \

endif

###### BLAS small matrix optimization #####
# Only when building with SMALL_MATRIX_OPT=1: append the four small-matrix
# GEMM kernel objects (nn, nt, tn, tt) to SBLASOBJS and to DBLASOBJS.
ifeq ($(SMALL_MATRIX_OPT), 1)

SBLASOBJS += \
sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX)

DBLASOBJS += \
dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX)

endif

###### BLAS extensions #####
SBLASOBJS += \
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
Expand Down Expand Up @@ -4075,3 +4088,63 @@ endif
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@



###### BLAS small matrix optimization #####

# Source file for each double-precision small-matrix GEMM kernel.  A
# DGEMM_SMALL_K_xx value that is already defined when this file is read is
# left untouched; otherwise the generic C implementation is used.  The
# variables were originally misspelled DGEMM_SAMLL_K_xx; a value set under
# that old spelling is still honoured as a fallback.

ifndef DGEMM_SMALL_K_NN
DGEMM_SMALL_K_NN = $(or $(DGEMM_SAMLL_K_NN),../generic/gemm_small_matrix_kernel_nn.c)
endif

ifndef DGEMM_SMALL_K_NT
DGEMM_SMALL_K_NT = $(or $(DGEMM_SAMLL_K_NT),../generic/gemm_small_matrix_kernel_nt.c)
endif

ifndef DGEMM_SMALL_K_TN
DGEMM_SMALL_K_TN = $(or $(DGEMM_SAMLL_K_TN),../generic/gemm_small_matrix_kernel_tn.c)
endif

ifndef DGEMM_SMALL_K_TT
DGEMM_SMALL_K_TT = $(or $(DGEMM_SAMLL_K_TT),../generic/gemm_small_matrix_kernel_tt.c)
endif

# Each kernel object is compiled from its selected source with DOUBLE defined
# and COMPLEX undefined.
$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@


# Source file for each single-precision small-matrix GEMM kernel.  A
# SGEMM_SMALL_K_xx value that is already defined when this file is read is
# left untouched; otherwise the generic C implementation is used.  The
# variables were originally misspelled SGEMM_SAMLL_K_xx; a value set under
# that old spelling is still honoured as a fallback.

ifndef SGEMM_SMALL_K_NN
SGEMM_SMALL_K_NN = $(or $(SGEMM_SAMLL_K_NN),../generic/gemm_small_matrix_kernel_nn.c)
endif

ifndef SGEMM_SMALL_K_NT
SGEMM_SMALL_K_NT = $(or $(SGEMM_SAMLL_K_NT),../generic/gemm_small_matrix_kernel_nt.c)
endif

ifndef SGEMM_SMALL_K_TN
SGEMM_SMALL_K_TN = $(or $(SGEMM_SAMLL_K_TN),../generic/gemm_small_matrix_kernel_tn.c)
endif

ifndef SGEMM_SMALL_K_TT
SGEMM_SMALL_K_TT = $(or $(SGEMM_SAMLL_K_TT),../generic/gemm_small_matrix_kernel_tt.c)
endif

# Each kernel object is compiled from its selected source with both DOUBLE
# and COMPLEX undefined.
$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
49 changes: 49 additions & 0 deletions kernel/generic/gemm_small_matrix_kernel_nn.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Naive small-matrix GEMM kernel, column-major, A and B both untransposed:
 *
 *   C[i + j*ldc] = beta * C[i + j*ldc]
 *                + alpha * sum_{k=0}^{K-1} A[i + k*lda] * B[k + j*ldb]
 *
 * for 0 <= i < M and 0 <= j < N.
 *
 *   M, N, K   extents of the i, j and k index ranges
 *   A, lda    first input matrix and its leading dimension
 *   alpha     scale factor applied to the accumulated product
 *   B, ldb    second input matrix and its leading dimension
 *   beta      scale factor applied to the previous contents of C
 *   C, ldc    output matrix (read only when beta != 0) and its leading
 *             dimension
 *
 * When beta is exactly zero C is treated as write-only: its previous contents
 * are never read, so NaN/Inf values in an uninitialized C cannot leak into
 * the result (standard BLAS GEMM semantics).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	BLASLONG i, j, k;
	FLOAT result;
	const int overwrite = (beta == 0.0);

	/* j outermost so the inner i loop walks a column of C contiguously. */
	for (j = 0; j < N; j++) {
		for (i = 0; i < M; i++) {
			result = 0.0;
			for (k = 0; k < K; k++) {
				result += A[i + k * lda] * B[k + j * ldb];
			}
			if (overwrite) {
				C[i + j * ldc] = alpha * result;
			} else {
				C[i + j * ldc] = C[i + j * ldc] * beta + alpha * result;
			}
		}
	}

	return 0;
}
49 changes: 49 additions & 0 deletions kernel/generic/gemm_small_matrix_kernel_nt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Naive small-matrix GEMM kernel, column-major, A untransposed and B read
 * transposed:
 *
 *   C[i + j*ldc] = beta * C[i + j*ldc]
 *                + alpha * sum_{k=0}^{K-1} A[i + k*lda] * B[j + k*ldb]
 *
 * for 0 <= i < M and 0 <= j < N.
 *
 *   M, N, K   extents of the i, j and k index ranges
 *   A, lda    first input matrix and its leading dimension
 *   alpha     scale factor applied to the accumulated product
 *   B, ldb    second input matrix (indexed as B[j + k*ldb]) and its leading
 *             dimension
 *   beta      scale factor applied to the previous contents of C
 *   C, ldc    output matrix (read only when beta != 0) and its leading
 *             dimension
 *
 * When beta is exactly zero C is treated as write-only: its previous contents
 * are never read, so NaN/Inf values in an uninitialized C cannot leak into
 * the result (standard BLAS GEMM semantics).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	BLASLONG i, j, k;
	FLOAT result;
	const int overwrite = (beta == 0.0);

	/* j outermost so the inner i loop walks a column of C contiguously. */
	for (j = 0; j < N; j++) {
		for (i = 0; i < M; i++) {
			result = 0.0;
			for (k = 0; k < K; k++) {
				result += A[i + k * lda] * B[k * ldb + j];
			}
			if (overwrite) {
				C[i + j * ldc] = alpha * result;
			} else {
				C[i + j * ldc] = C[i + j * ldc] * beta + alpha * result;
			}
		}
	}

	return 0;
}
49 changes: 49 additions & 0 deletions kernel/generic/gemm_small_matrix_kernel_tn.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Naive small-matrix GEMM kernel, column-major, A read transposed and B
 * untransposed:
 *
 *   C[i + j*ldc] = beta * C[i + j*ldc]
 *                + alpha * sum_{k=0}^{K-1} A[k + i*lda] * B[k + j*ldb]
 *
 * for 0 <= i < M and 0 <= j < N.
 *
 *   M, N, K   extents of the i, j and k index ranges
 *   A, lda    first input matrix (indexed as A[k + i*lda]) and its leading
 *             dimension
 *   alpha     scale factor applied to the accumulated product
 *   B, ldb    second input matrix and its leading dimension
 *   beta      scale factor applied to the previous contents of C
 *   C, ldc    output matrix (read only when beta != 0) and its leading
 *             dimension
 *
 * When beta is exactly zero C is treated as write-only: its previous contents
 * are never read, so NaN/Inf values in an uninitialized C cannot leak into
 * the result (standard BLAS GEMM semantics).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	BLASLONG i, j, k;
	FLOAT result;
	const int overwrite = (beta == 0.0);

	/* j outermost so the inner i loop walks a column of C contiguously. */
	for (j = 0; j < N; j++) {
		for (i = 0; i < M; i++) {
			result = 0.0;
			for (k = 0; k < K; k++) {
				result += A[i * lda + k] * B[k + j * ldb];
			}
			if (overwrite) {
				C[i + j * ldc] = alpha * result;
			} else {
				C[i + j * ldc] = C[i + j * ldc] * beta + alpha * result;
			}
		}
	}

	return 0;
}
Loading

1 comment on commit aae6af9

@martin-frbg
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Beware some typos in Makefile.L3 (SAMLL <-> SMALL)

Please sign in to comment.