diff --git a/benchmark.py b/benchmark.py
index e93b21e..f51a6d8 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -22,7 +22,7 @@
 num_trails = args.num_trails
 
 
-def flops_benchmark(device):
+def flops_benchmark():
     test_range = 2 ** np.arange(8, 13, 0.25)
 
     print('size, elapsed_time, flops')
@@ -57,9 +57,11 @@ def synchronize(device):
         pass
 
 
-def memory_bandwidth_benchmark(device):
+def memory_bandwidth_benchmark(from_device=device):
+    from_device = torch.device(from_device)
     test_range = 2 ** (np.arange(20, 27, 0.5))
 
+    print(f'measuring bw from {from_device} to {device}')
     print('size (GB), elapsed_time, bandwidth')
     for size in test_range:
         elapsed_time = 0
@@ -68,7 +70,7 @@ def memory_bandwidth_benchmark(device):
 
             # Create random tensors
             a = torch.rand(size, device=device)
-            b = torch.rand(size, device=device)
+            b = torch.rand(size, device=from_device)
 
             # Warm-up to ensure CUDA kernel is initialized if using GPU
             synchronize(device)
@@ -93,7 +95,12 @@ def memory_bandwidth_benchmark(device):
         elapsed_time = elapsed_time / num_trails
         # Calculate Bandwidth in GB/s
         bytes_copied = a.nelement() * a.element_size()  # bytes
-        bandwidth = 2 * bytes_copied / elapsed_time / 1e9  # GB/s
+        if from_device == device:
+            # data is copied to the device and back, so count the bytes twice
+            factor = 2
+        else:
+            factor = 1
+        bandwidth = factor * bytes_copied / elapsed_time / 1e9  # GB/s
 
         print(bytes_copied / 1e9, elapsed_time, bandwidth, sep=', ')
 
@@ -102,5 +109,5 @@
 if __name__ == "__main__":
     print(f'benchmarking {device}')
 
-    flops_benchmark(device)
-    memory_bandwidth_benchmark(device)
+    flops_benchmark()
+    memory_bandwidth_benchmark()