diff --git a/benchmark.py b/benchmark.py
index e93b21e..f51a6d8 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -22,7 +22,7 @@
 num_trails = args.num_trails
 
 
-def flops_benchmark(device):
+def flops_benchmark():
     test_range = 2 ** np.arange(8, 13, 0.25)
 
     print('size, elapsed_time, flops')
@@ -57,9 +57,11 @@ def synchronize(device):
         pass
 
 
-def memory_bandwidth_benchmark(device):
+def memory_bandwidth_benchmark(from_device=device):
+    from_device = torch.device(from_device)
     test_range = 2 ** (np.arange(20, 27, 0.5))
 
+    print(f'measuring bw from {from_device} to {device}')
     print('size (GB), elapsed_time, bandwidth')
     for size in test_range:
         elapsed_time = 0
@@ -68,7 +70,7 @@ def memory_bandwidth_benchmark(device):
 
             # Create random tensors
             a = torch.rand(size, device=device)
-            b = torch.rand(size, device=device)
+            b = torch.rand(size, device=from_device)
 
             # Warm-up to ensure CUDA kernel is initialized if using GPU
             synchronize(device)
@@ -93,7 +95,12 @@ def memory_bandwidth_benchmark(device):
         elapsed_time = elapsed_time / num_trails
         # Calculate Bandwidth in GB/s
         bytes_copied = a.nelement() * a.element_size()  # bytes
-        bandwidth = 2 * bytes_copied / elapsed_time / 1e9  # GB/s
+        if from_device == device:
+            # data is copied to the device and back, so count the bytes twice
+            factor = 2
+        else:
+            factor = 1
+        bandwidth = factor * bytes_copied / elapsed_time / 1e9  # GB/s
 
         print(bytes_copied / 1e9, elapsed_time, bandwidth, sep=', ')
 
@@ -102,5 +109,5 @@
 if __name__ == "__main__":
     print(f'benchmarking {device}')
 
-    flops_benchmark(device)
-    memory_bandwidth_benchmark(device)
+    flops_benchmark()
+    memory_bandwidth_benchmark()