From ba47c7f4f301aad100ed166de338b86e01da8465 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 16 Jul 2024 15:57:24 -0500 Subject: [PATCH 1/3] Vectorize reduction stage of sgemv_t. --- kernel/power/sgemv_t.c | 54 ++++++++++++++++++++++++++++++---------- kernel/power/sgemv_t_8.c | 54 ++++++++++++++++++++++++++++++---------- 2 files changed, 82 insertions(+), 26 deletions(-) diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index c3fc8e77a1..e133c815c3 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -79,15 +79,32 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + t0 = vec_mergeh(temp4, temp6); + t1 = vec_mergel(temp4, temp6); + t2 = vec_mergeh(temp5, temp7); + t3 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t0, t2); + temp5 = vec_mergel(t0, t2); + temp6 = vec_mergeh(t1, t3); + temp7 = vec_mergel(t1, t3); + temp4 += temp5 + temp6 + temp7; + + v_y[0] += a * temp0; + v_y[1] += a * temp4; } @@ -116,10 +133,21 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp3 += v_x[i] * va3[i]; } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + v_y[0] += a * temp0; } diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index 1ee7c8aebb..f21f6eb7d2 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -100,15 +100,32 @@ static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + t0 = vec_mergeh(temp4, temp6); + t1 = vec_mergel(temp4, temp6); + t2 = vec_mergeh(temp5, temp7); + t3 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t0, t2); + temp5 = vec_mergel(t0, t2); + temp6 = vec_mergeh(t1, t3); + temp7 = vec_mergel(t1, t3); + temp4 += temp5 + temp6 + temp7; + + v_y[0] += a * temp0; + v_y[1] += a * temp4; } @@ -137,10 +154,21 @@ static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + v_y[0] += a * temp0; } From 66622de36d0b30161fcfbbf1ad22007f654efa4d Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 19 Jul 2024 07:26:08 -0500 Subject: [PATCH 2/3] Hack: Test gemv vs gemm. --- interface/gemm.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/interface/gemm.c b/interface/gemm.c index 4537b6a78f..e31e22d241 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -47,22 +47,29 @@ #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " +#define GEMV BLASFUNC(qgemv) #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#define GEMV BLASFUNC(dgemv) #elif defined(BFLOAT16) #define ERROR_NAME "SBGEMM " +#define GEMV BLASFUNC(sbgemv) #else #define ERROR_NAME "SGEMM " +#define GEMV BLASFUNC(sgemv) #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " +#define GEMV BLASFUNC(xgemv) #elif defined(DOUBLE) #define ERROR_NAME "ZGEMM " +#define GEMV BLASFUNC(zgemv) #else #define ERROR_NAME "CGEMM " +#define GEMV BLASFUNC(cgemv) #endif #else #ifdef XDOUBLE @@ -190,6 +197,16 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if 1 + if (*N == 1) { + GEMV(TRANSA, K, M, alpha, a, ldA, b, N, beta, c, N); +//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) + return; + } +#endif +#endif + #ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) From e2334d02180c5bc24592ab4fe65aa109da19179b Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 1 Aug 2024 14:44:40 -0500 Subject: [PATCH 3/3] Remove GEMV hack. --- interface/gemm.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index e31e22d241..4537b6a78f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -47,29 +47,22 @@ #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " -#define GEMV BLASFUNC(qgemv) #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " -#define GEMV BLASFUNC(dgemv) #elif defined(BFLOAT16) #define ERROR_NAME "SBGEMM " -#define GEMV BLASFUNC(sbgemv) #else #define ERROR_NAME "SGEMM " -#define GEMV BLASFUNC(sgemv) #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " -#define GEMV BLASFUNC(xgemv) #elif defined(DOUBLE) #define ERROR_NAME "ZGEMM " -#define GEMV BLASFUNC(zgemv) #else #define ERROR_NAME "CGEMM " -#define GEMV BLASFUNC(cgemv) #endif #else #ifdef XDOUBLE @@ -197,16 +190,6 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) -#if 1 - if (*N == 1) { - GEMV(TRANSA, K, M, alpha, a, ldA, b, N, beta, c, N); -//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) - return; - } -#endif -#endif - #ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)