Performance regression in version 0.3.28 on Graviton4 #4939

Open · dnoan opened this issue Oct 15, 2024 · 24 comments

dnoan (Contributor) commented Oct 15, 2024

Recently I reported a performance regression at a4e56e0. A glitch in GitHub prevents me from posting a follow-up there, so I am opening this ticket.

In my case the app calls DGEMM with 86100 different inputs. I picked some of what appear to be the most common calls (a standalone reproduction sketch follows the list):

DGEMM TRANSA: T, TRANSB: N, M: 45, N: 1, K: 211, ALPHA: -1.000000, LDA: 256, LDB: 256, BETA: 1.000000, LDC: 5383
DGEMM TRANSA: T, TRANSB: N, M: 23, N: 1, K: 117, ALPHA: -1.000000, LDA: 140, LDB: 140, BETA: 1.000000, LDC: 5383
DGEMM TRANSA: T, TRANSB: N, M: 211, N: 1, K: 45, ALPHA: -1.000000, LDA: 45, LDB: 45, BETA: 1.000000, LDC: 211
DGEMM TRANSA: T, TRANSB: N, M: 117, N: 1, K: 23, ALPHA: -1.000000, LDA: 23, LDB: 23, BETA: 1.000000, LDC: 117
DGEMM TRANSA: T, TRANSB: N, M: 9, N: 1, K: 75, ALPHA: -1.000000, LDA: 84, LDB: 84, BETA: 1.000000, LDC: 5383
DGEMM TRANSA: T, TRANSB: N, M: 98, N: 1, K: 17, ALPHA: -1.000000, LDA: 17, LDB: 17, BETA: 1.000000, LDC: 98
DGEMM TRANSA: T, TRANSB: N, M: 94, N: 1, K: 56, ALPHA: -1.000000, LDA: 56, LDB: 56, BETA: 1.000000, LDC: 94

DGEMM TRANSA: N, TRANSB: N, M: 5, N: 5, K: 1, ALPHA: -1.000000, LDA: 5, LDB: 301, BETA: 1.000000, LDC: 301
DGEMM TRANSA: N, TRANSB: N, M: 8, N: 8, K: 1, ALPHA: -1.000000, LDA: 8, LDB: 301, BETA: 1.000000, LDC: 301
DGEMM TRANSA: N, TRANSB: N, M: 33, N: 20, K: 1, ALPHA: -1.000000, LDA: 33, LDB: 301, BETA: 1.000000, LDC: 301
DGEMM TRANSA: N, TRANSB: N, M: 69, N: 24, K: 1, ALPHA: -1.000000, LDA: 69, LDB: 301, BETA: 1.000000, LDC: 301
DGEMM TRANSA: N, TRANSB: N, M: 210, N: 15, K: 1, ALPHA: -1.000000, LDA: 210, LDB: 211, BETA: 1.000000, LDC: 211
DGEMM TRANSA: N, TRANSB: N, M: 196, N: 1, K: 1, ALPHA: -1.000000, LDA: 196, LDB: 211, BETA: 1.000000, LDC: 211
DGEMM TRANSA: N, TRANSB: N, M: 211, N: 211, K: 45, ALPHA: -1.000000, LDA: 256, LDB: 256, BETA: 1.000000, LDC: 256
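For reference, the first of these shapes can be reproduced in isolation with a small driver like the one below. This is only a sketch with made-up data, assuming the CBLAS interface (build with something like gcc repro.c -lopenblas):

#include <cblas.h>
#include <stdlib.h>

int main(void) {
  const int M = 45, N = 1, K = 211;
  const int lda = 256, ldb = 256, ldc = 5383;
  // Column-major storage: op(A) = A^T is M x K, so A itself is stored K x M.
  double *A = malloc(sizeof(double) * lda * M);
  double *B = malloc(sizeof(double) * ldb * N);
  double *C = malloc(sizeof(double) * ldc * N);
  for (int i = 0; i < lda * M; i++) A[i] = 1.0;
  for (int i = 0; i < ldb * N; i++) B[i] = 1.0;
  for (int i = 0; i < ldc * N; i++) C[i] = 0.0;
  // C := -1.0 * A^T * B + 1.0 * C, repeated so the cost is measurable under `time`.
  for (int iter = 0; iter < 1000000; iter++)
    cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                M, N, K, -1.0, A, lda, B, ldb, 1.0, C, ldc);
  free(A); free(B); free(C);
  return 0;
}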
Mousius (Contributor) commented Oct 15, 2024

Hi @dnoan,

That is unfortunate. What's interesting is that many of these have N=1, which means they should go through GEMV, not GEMM:

OpenBLAS/interface/gemm.c, lines 501 to 545 at 8483a71:

#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16)
  // Check if we can convert GEMM -> GEMV
  if (args.k != 0) {
    if (args.n == 1) {
      blasint inc_x = 1;
      blasint inc_y = 1;
      // These were passed in as blasint, but the struct translates them to blaslong
      blasint m = args.m;
      blasint n = args.k;
      blasint lda = args.lda;
      // Create new transpose parameters
      char NT = 'N';
      if (transa & 1) {
        NT = 'T';
        m = args.k;
        n = args.m;
      }
      if (transb & 1) {
        inc_x = args.ldb;
      }
      GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y);
      return;
    }
    if (args.m == 1) {
      blasint inc_x = args.lda;
      blasint inc_y = args.ldc;
      // These were passed in as blasint, but the struct translates them to blaslong
      blasint m = args.k;
      blasint n = args.n;
      blasint ldb = args.ldb;
      // Create new transpose parameters
      char NT = 'T';
      if (transa & 1) {
        inc_x = 1;
      }
      if (transb & 1) {
        NT = 'N';
        m = args.n;
        n = args.k;
      }
      GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y);
      return;
    }
  }
#endif
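For what it's worth, the equivalence this dispatch relies on is easy to check numerically. Here is a minimal sketch for the TRANSA=T, N=1 shape from the report (made-up data; assumes the CBLAS interface, build with gcc check.c -lopenblas -lm): an N=1 DGEMM and the DGEMV it should forward to must agree.

#include <cblas.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  enum { M = 45, K = 211, LDA = 256 };
  static double A[LDA * M], x[K], c_gemm[M], c_gemv[M];
  for (int i = 0; i < LDA * M; i++) A[i] = (i % 7) * 0.25;
  for (int i = 0; i < K; i++) x[i] = (i % 5) * 0.5;
  for (int i = 0; i < M; i++) c_gemm[i] = c_gemv[i] = 1.0;
  // GEMM path: C := -1 * A^T * x + C, with N = 1 (A stored K x M).
  cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
              M, 1, K, -1.0, A, LDA, x, K, 1.0, c_gemm, M);
  // The GEMV it should forward to: y := -1 * A^T * x + y, with A as K x M.
  cblas_dgemv(CblasColMajor, CblasTrans, K, M, -1.0, A, LDA, x, 1, 1.0, c_gemv, 1);
  double maxdiff = 0.0;
  for (int i = 0; i < M; i++)
    maxdiff = fmax(maxdiff, fabs(c_gemm[i] - c_gemv[i]));
  printf("max |gemm - gemv| = %g\n", maxdiff); // expect 0 (or rounding-level noise)
  return 0;
}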

The code path touched by the patch you're pointing at, meanwhile, is only reached later, after the small-GEMM permit check:

OpenBLAS/interface/gemm.c, lines 551 to 571 at 8483a71:

#if USE_SMALL_MATRIX_OPT
#if !defined(COMPLEX)
  if (GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))) {
    if (*(FLOAT *)(args.beta) == 0.0) {
      (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc);
    } else {
      (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc);
    }
    return;
  }
#else
  if (GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])) {
    if (beta[0] == 0.0 && beta[1] == 0.0) {
      (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc);
    } else {
      (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc);
    }
    return;
  }
#endif
#endif
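As an aside, the (transb << 2) | transa expression selects an entry in a per-transpose table of kernels. A tiny sketch of the resulting indices for the real-valued case (the mapping is inferred from the expression above; the shift by 2 leaves room for the conjugate variants used by complex types):

#include <stdio.h>

int main(void) {
  const char *names[2] = {"N", "T"}; // 0 = no transpose, 1 = transpose
  for (int transb = 0; transb < 2; transb++)
    for (int transa = 0; transa < 2; transa++)
      printf("TRANSA=%s TRANSB=%s -> kernel slot %d\n",
             names[transa], names[transb], (transb << 2) | transa);
  return 0;
}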

The dgemm permit for small gemm is conservative (M*N*K <= 64*64*64):
https://github.com/OpenMathLib/OpenBLAS/blob/8483a71169bac112db133d45b39d4def812f81b6/kernel/arm64/gemm_small_kernel_permit_sve.c#L35C14-L35C22

This indicates that the last line (M: 211, N: 211, K: 45) would not trigger the small kernels either: 211·211·45 = 2,003,445, well above 64³ = 262,144.

That leaves the K=1 path; could you modify the permit so that it does not take the small-matrix kernels for that path, and see if your performance improves?

Could you also let me know what compiler you're using?

martin-frbg (Collaborator) commented Oct 15, 2024

This could indeed be an unintentional downgrade caused by switching from SVE GEMM to NEON GEMV (the SVE implementation of the latter, from #4803, only being available on A64FX right now).

martin-frbg (Collaborator):

Might be worthwhile to copy kernel/arm64/KERNEL.A64FX to kernel/arm64/KERNEL.NEOVERSEV2 and rebuild/retest.

dnoan (Contributor, Author) commented Oct 16, 2024

Copying KERNEL.A64FX to KERNEL.NEOVERSEV2 didn't improve performance. Actually, the workload ran marginally slower.

@Mousius, can you provide a diff so that I don't mess things up?

martin-frbg (Collaborator):

Guess that would be something like:

--- a/kernel/arm64/gemm_small_kernel_permit_sve.c
+++ b/kernel/arm64/gemm_small_kernel_permit_sve.c
@@ -32,7 +32,7 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
   BLASLONG MNK = M * N * K;
 
 #if defined(DOUBLE) // dgemm
-  if (MNK <= 64*64*64)
+  if (MNK <= 64*64*64 && K > 1)
     return 1;
 #else // sgemm
   if (MNK <= 64*64*64)

dnoan (Contributor, Author) commented Oct 22, 2024

I tried K > 1, but it didn't help. I also tried different thresholds in place of the default 64*64*64, using 8, 16, 32 and 128 as the base, and there was no significant difference.

martin-frbg (Collaborator):

Thank you. But are you certain that it was a4e56e0 that brought the slowdown, or is this an educated guess? (Does speed return to normal when you simply return 0 for any value there?)

dnoan (Contributor, Author) commented Oct 24, 2024

I bisected the commits between 0.3.27 and 0.3.28.

The following had no effect:

--- a/kernel/arm64/gemm_small_kernel_permit_sve.c
+++ b/kernel/arm64/gemm_small_kernel_permit_sve.c
@@ -32,8 +32,7 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
   BLASLONG MNK = M * N * K;

 #if defined(DOUBLE) // dgemm
-  if (MNK <= 64*64*64)
-    return 1;
+    return 0;
 #else // sgemm
   if (MNK <= 64*64*64)
     return 1;

The following brought performance back to normal:

--- a/Makefile.system
+++ b/Makefile.system
@@ -268,8 +268,6 @@ SMALL_MATRIX_OPT = 1
 else ifeq ($(ARCH), power)
 SMALL_MATRIX_OPT = 1
 BUILD_BFLOAT16 = 1
-else ifeq ($(ARCH), arm64)
-SMALL_MATRIX_OPT = 1
 endif
 ifeq ($(ARCH), loongarch64)
 SMALL_MATRIX_OPT = 1

martin-frbg (Collaborator):

Thanks - this is surprising. I would have expected the two to be equivalent (unless I'm missing something - the check for "small size" itself can't be that expensive??)

dnoan (Contributor, Author) commented Oct 24, 2024

I removed the check completely, and that produced the worst result yet:

--- a/kernel/arm64/gemm_small_kernel_permit_sve.c
+++ b/kernel/arm64/gemm_small_kernel_permit_sve.c
@@ -29,15 +29,5 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
 {
-  BLASLONG MNK = M * N * K;
-
-#if defined(DOUBLE) // dgemm
-  if (MNK <= 64*64*64)
-    return 1;
-#else // sgemm
-  if (MNK <= 64*64*64)
-    return 1;
-#endif
-
   return 0;
 }

dnoan (Contributor, Author) commented Oct 24, 2024

By the way, I don't know if the performance issue is in DGEMM. Could it be that SMALL_MATRIX_OPT triggers a different code path somewhere else?

martin-frbg (Collaborator):

Hmmm. For that to be the worst result, your code must be doing lots of SGEMM calls as well (?), and that simple call to gemm_small_kernel_permit must be causing a cache flush or something.
At least SMALL_MATRIX_OPT is not supposed to be triggering anything else... (besides me, now)

martin-frbg (Collaborator):

...definitely no other code paths affected besides GEMM (and the directly related GEMM_BATCH). Also "small matrix" is checked only after the possible branching into GEMV, so most of the representative DGEMM calls you picked out should never even get that far down into interface/gemm.c (as mentioned by Mousius in his very first comment)

Mousius (Contributor) commented Oct 24, 2024

Maybe I missed it, but how much worse is the performance with the return 0 permit compared to 0.3.27? There has to be some overhead for small GEMM, which the faster kernel would hopefully overcome. I'm curious about the performance drop for that check in your use case.

dnoan (Contributor, Author) commented Oct 25, 2024

I rebuilt and reran everything. The example workload makes mostly LAPACK calls but also some BLAS calls, including DGEMM. Testing was done on c8g.metal-24xl. I deleted the entire project after each build-and-test run and started from scratch.

Version 0.3.27 - 250 seconds
Version 0.3.28 - 275 seconds
Version 0.3.28 with SMALL_MATRIX_OPT removed from Makefile.system - 252 seconds
Version 0.3.28 - returning 0 for DGEMM - 264 seconds:

int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
  BLASLONG MNK = M * N * K;

#if defined(DOUBLE) // dgemm
  if (MNK <= 64*64*64)
    return 0;
#else // sgemm
  if (MNK <= 64*64*64)
    return 1;
#endif

  return 0;
}

Version 0.3.28 - returning 0 for DGEMM and moving the initialization of BLASLONG to the SGEMM section - 265 seconds:

int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
#if defined(DOUBLE) // dgemm
    return 0;
#else // sgemm
  BLASLONG MNK = M * N * K;

  if (MNK <= 64*64*64)
    return 1;
#endif

  return 0;
}

Version 0.3.28 - returning 0 from CNAME - 264 seconds:

int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
  return 0;
}

Mousius (Contributor) commented Oct 26, 2024

Thanks, @dnoan. This is very helpful. It's interesting to see the ~5% drop from just enabling the small matrix optimisation.

When calling dgemm, what are your transpose settings? I'd like to quickly write an alternative kernel for you to try 😸

dnoan (Contributor, Author) commented Oct 30, 2024

In this case transpose is TN and NN.

Mousius (Contributor) commented Nov 4, 2024

> In this case transpose is TN and NN.

Thanks! I've put some experimental ASIMD kernels in #4963. Can you give them a try and see whether they perform better? I hypothesise that ASIMD may be slightly more efficient than SVE at the 128-bit vector length for such small sizes.

dnoan (Contributor, Author) commented Nov 8, 2024

There is no difference in performance compared to 0.3.28. I don't know how OpenBLAS works internally, but I can't see anywhere in the diff where we switch from the SVE kernels to the ASIMD ones.

Mousius (Contributor) commented Nov 8, 2024

I've commented out the SVE kernels and added ASIMD ones for the ARMV8SVE target here:
https://github.com/OpenMathLib/OpenBLAS/pull/4963/files#diff-fb8d5777e1eda9b02b7e71a3fdcba2f0fe05ac790c0cc9437fda31ba8e71b12e

And added them explicitly to the NEOVERSEV2 target here:
https://github.com/OpenMathLib/OpenBLAS/pull/4963/files#diff-668e5b5408832c20baf0ef0798427647b961a3cc34e9b38e7cb653b5b209a39e

There is potential I got this wrong; it seems odd that you're seeing no performance change 🤔

martin-frbg (Collaborator):

I'm beginning to suspect a problem with CPU autodetection - @dnoan, are you building with DYNAMIC_ARCH=1 or not?
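If it helps, a DYNAMIC_ARCH build can be asked what it detected at runtime. A minimal sketch using the helpers libopenblas exports (build with gcc check.c -lopenblas):

#include <stdio.h>

// Exported by libopenblas; declared here to keep the sketch self-contained.
extern char *openblas_get_config(void);
extern char *openblas_get_corename(void);

int main(void) {
  printf("config: %s\n", openblas_get_config());   // build options and version
  printf("core:   %s\n", openblas_get_corename()); // detected core; NEOVERSEV2 would be expected on Graviton4
  return 0;
}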

dnoan (Contributor, Author) commented Nov 8, 2024

Yes, my settings are:

TARGET = ARMV8
DYNAMIC_ARCH = 1
USE_THREAD = 1
USE_LOCKING = 1
USE_OPENMP = 1
NUM_THREADS = 256
BUFFERSIZE = 22

Mousius (Contributor) commented Nov 13, 2024

@dnoan, what compiler are you using? If it's an older compiler, or it's mis-detected, it might not enable the SVE targets.

dnoan (Contributor, Author) commented Nov 13, 2024

I am compiling with GCC 13.2.
