Use the best (fastest) timed iteration when reporting max TFLOP/s (#32)

EleutherAI · Mar 7, 2024 · 233f003 · 233f003
1 parent 56aeee1
commit 233f003
Showing 1 changed file with 8 additions and 8 deletions.
diff --git a/benchmarks/sizing/utils.py b/benchmarks/sizing/utils.py
@@ -72,7 +72,7 @@ def benchmark_mm(m, n, k, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {m}x{n}x{k}: {elapsed_time:.3f}")
     print(f"Throughput (in TFLOP/s) for {m}x{n}x{k}: {(2 * m * n * k) / (elapsed_time * 10**12):.3f}")
     print("-" * 80)
@@ -99,7 +99,7 @@ def benchmark_mm_b(m, n, k, label, b, num_iterations,num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({m}x{n}x{k}, b={b}): {elapsed_time :.4f}")
     print(f"Throughput (in TFLOP/s) for {label} ({m}x{n}x{k}, b={b}): "
           f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}")
@@ -120,7 +120,7 @@ def benchmark_bmm(b, m, n, k, label,num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({b}x{m}x{n}x{k}): {elapsed_time :.4f}")
     print(f"Throughput (in TFLOP/s) for {label} ({b}x{m}x{n}x{k}): "
           f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}")
@@ -141,7 +141,7 @@ def benchmark_dropout(A_dim, label, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time
 
@@ -166,7 +166,7 @@ def benchmark_softmax(scores_shape, seq_length, label, num_iterations,num_warmup
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({display(scores_shape)}): {elapsed_time :.4f}")
     return elapsed_time
 
@@ -184,7 +184,7 @@ def benchmark_fused_gelu(A_dim, b_dim, label, num_iterations, num_warmup_iterati
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time
 
@@ -202,7 +202,7 @@ def benchmark_layer_norm(A_dim, normalized_shape, label, num_iterations, num_war
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time
 
@@ -221,6 +221,6 @@ def benchmark_add_bias_dropout(shape, label, num_iterations, num_warmup_iteratio
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000 
+    elapsed_time = np.amin(times)/1000 
     print(f"Elapsed time for {label} ({display(shape)}): {elapsed_time :.4f}")
     return elapsed_time