diff --git a/benchmarks/sizing/utils.py b/benchmarks/sizing/utils.py
index 70e197b..59f85ce 100644
--- a/benchmarks/sizing/utils.py
+++ b/benchmarks/sizing/utils.py
@@ -72,7 +72,7 @@ def benchmark_mm(m, n, k, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {m}x{n}x{k}: {elapsed_time:.3f}")
     print(f"Throughput (in TFLOP/s) for {m}x{n}x{k}: {(2 * m * n * k) / (elapsed_time * 10**12):.3f}")
     print("-" * 80)
@@ -99,7 +99,7 @@ def benchmark_mm_b(m, n, k, label, b, num_iterations,num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({m}x{n}x{k}, b={b}): {elapsed_time :.4f}")
     print(f"Throughput (in TFLOP/s) for {label} ({m}x{n}x{k}, b={b}): "
           f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}")
@@ -120,7 +120,7 @@ def benchmark_bmm(b, m, n, k, label,num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({b}x{m}x{n}x{k}): {elapsed_time :.4f}")
     print(f"Throughput (in TFLOP/s) for {label} ({b}x{m}x{n}x{k}): "
           f"{(2 * b * m * n * k) / (elapsed_time * 10**12):.3f}")
@@ -141,7 +141,7 @@ def benchmark_dropout(A_dim, label, num_iterations, num_warmup_iterations):
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time
@@ -166,7 +166,7 @@ def benchmark_softmax(scores_shape, seq_length, label, num_iterations,num_warmup
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(scores_shape)}): {elapsed_time :.4f}")
     return elapsed_time
@@ -184,7 +184,7 @@ def benchmark_fused_gelu(A_dim, b_dim, label, num_iterations, num_warmup_iterati
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time
@@ -202,7 +202,7 @@ def benchmark_layer_norm(A_dim, normalized_shape, label, num_iterations, num_war
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(A_dim)}): {elapsed_time :.4f}")
     return elapsed_time
@@ -221,6 +221,6 @@ def benchmark_add_bias_dropout(shape, label, num_iterations, num_warmup_iteratio
         torch.cuda.synchronize()
         times[i] = start.elapsed_time(end)
     times = times[num_warmup_iterations:]
-    elapsed_time = np.amax(times)/1000
+    elapsed_time = np.amin(times)/1000
     print(f"Elapsed time for {label} ({display(shape)}): {elapsed_time :.4f}")
     return elapsed_time
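
Note: every hunk makes the same one-token change, np.amax -> np.amin, in the shared CUDA-event timing pattern. Below is a minimal self-contained sketch of that pattern, extracted for illustration; benchmark_op and its op argument are hypothetical names, not functions in utils.py, and the loop structure is assumed from the context lines of the diff.

    import numpy as np
    import torch

    def benchmark_op(op, num_iterations, num_warmup_iterations):
        # Hypothetical helper mirroring the timing loop shared by the
        # benchmark_* functions touched in this diff.
        times = np.zeros(num_iterations + num_warmup_iterations)
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        for i in range(num_iterations + num_warmup_iterations):
            with torch.no_grad():
                start.record()
                op()
                end.record()
            torch.cuda.synchronize()
            times[i] = start.elapsed_time(end)  # milliseconds
        times = times[num_warmup_iterations:]   # discard warmup iterations
        # np.amin keeps the fastest post-warmup run, the least-noisy estimate
        # of achievable kernel time; np.amax would instead report the run
        # most inflated by scheduling jitter or clock ramp-up.
        return np.amin(times) / 1000            # seconds

    # Example usage (assumed shapes): time a 4096x4096x4096 fp16 matmul.
    # A = torch.randn(4096, 4096, dtype=torch.half, device="cuda")
    # B = torch.randn(4096, 4096, dtype=torch.half, device="cuda")
    # elapsed = benchmark_op(lambda: torch.mm(A, B), 100, 10)

Taking the minimum over repeated runs is the conventional aggregate for microbenchmarks (Python's timeit documentation recommends the same), which is why the diff replaces the worst-case amax with amin.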