This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[Opperf] Make module/namespace of the operator parameterized (#15226)
* Make module/namespace parameterized to choose between mx.nd or mx.np

* Fix comments

* Add automated way to fetch compile/runtime flags for MXNet

* Fix warmup and runs count

* Fix Pooling operator benchmarks
sandeep-krishnamurthy committed Jun 28, 2019
1 parent 92fce90 commit e8f3e91
Showing 14 changed files with 421 additions and 65 deletions.
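The common thread across these files: each benchmark now resolves its operator by name from a configurable module, MX_OP_MODULE (imported from benchmark/opperf/rules/default_params.py), instead of hard-coding mx.nd. A minimal sketch of the pattern, assuming MX_OP_MODULE is a plain alias for one of the two namespaces (the default_params.py diff itself is not among the files shown here):

    import mxnet as mx

    # Assumption: MX_OP_MODULE aliases a namespace module; point it at mx.np
    # instead to benchmark the NumPy-compatible operators.
    MX_OP_MODULE = mx.nd

    # Operators are looked up by name at call time, so benchmark code never
    # references mx.nd.dot (or mx.np.dot) directly.
    dot_op = getattr(MX_OP_MODULE, "dot")
    out = dot_op(mx.nd.ones((2, 3)), mx.nd.ones((3, 2)))
    print(out.shape)  # (2, 2)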
5 changes: 3 additions & 2 deletions benchmark/opperf/README.md
@@ -24,10 +24,11 @@ With this utility, for each MXNet operator you can get the following details:
**Timing**
1. Forward execution time
2. Backward execution time
3. Time spent for memory management

**Memory**
1. Total memory allocated
1. Average and Max memory allocated

NOTE: This is the `pool memory`. It does not reflect the exact memory requested by the operator.

# Motivation

8 changes: 4 additions & 4 deletions benchmark/opperf/nd_operations/binary_operators.py
@@ -38,7 +38,7 @@
get_all_elemen_wise_binary_operators


def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the binary
broadcast operators in MXNet.
@@ -48,9 +48,9 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
Context to run benchmarks
dtype: str, default 'float32'
Precision to use for benchmarks
warmup: int, default 10
warmup: int, default 25
Number of times to run for warmup
runs: int, default 50
runs: int, default 100
Number of runs to capture benchmark results
Returns
@@ -65,7 +65,7 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
return mx_binary_op_results


def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the binary
element_wise operators in MXNet.
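From the caller's side nothing changes besides the defaults; a usage sketch (the result is a merged map of per-operator metrics via merge_map_list, as in the other modules; its exact layout is an assumption here):

    import mxnet as mx
    from benchmark.opperf.nd_operations.binary_operators import \
        run_mx_binary_broadcast_operators_benchmarks

    # warmup/runs remain overridable; 25 and 100 are only the new defaults.
    results = run_mx_binary_broadcast_operators_benchmarks(
        ctx=mx.cpu(), dtype='float32', warmup=25, runs=100)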
13 changes: 6 additions & 7 deletions benchmark/opperf/nd_operations/gemm_operators.py
@@ -16,10 +16,9 @@
# under the License.

import mxnet as mx
from mxnet import nd
from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.utils.common_utils import merge_map_list

from benchmark.opperf.rules.default_params import MX_OP_MODULE
"""Performance benchmark tests for MXNet NDArray GEMM Operators.
1. dot
@@ -35,7 +34,7 @@
"""


def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the GEMM
operators (dot, batch_dot) in MXNet.
@@ -45,9 +44,9 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs
Context to run benchmarks
dtype: str, default 'float32'
Precision to use for benchmarks
warmup: int, default 10
warmup: int, default 25
Number of times to run for warmup
runs: int, default 50
runs: int, default 100
Number of runs to capture benchmark results
Returns
@@ -57,7 +56,7 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs
"""
# Benchmark tests for dot and batch_dot operators
dot_benchmark_res = run_performance_test(
[nd.dot], run_backward=True,
[getattr(MX_OP_MODULE, "dot")], run_backward=True,
dtype=dtype, ctx=ctx,
inputs=[{"lhs": (1024, 1024),
"rhs": (1024, 1024)},
@@ -71,7 +70,7 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs
warmup=warmup, runs=runs)

batch_dot_benchmark_res = run_performance_test(
[nd.batch_dot], run_backward=True,
[getattr(MX_OP_MODULE, "batch_dot")], run_backward=True,
dtype=dtype, ctx=ctx,
inputs=[{"lhs": (32, 1024, 1024),
"rhs": (32, 1024, 1024)},
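The getattr substitution is behavior-preserving as long as MX_OP_MODULE resolves to the NDArray namespace; a quick sanity-check sketch (the equivalence, not the benchmark harness, is the point here):

    import mxnet as mx
    from benchmark.opperf.rules.default_params import MX_OP_MODULE

    op = getattr(MX_OP_MODULE, "dot")
    a, b = mx.nd.ones((4, 5)), mx.nd.ones((5, 6))
    # Assumption: the default MX_OP_MODULE targets mx.nd, so the resolved
    # operator must agree with mx.nd.dot on the same inputs.
    assert (op(a, b) == mx.nd.dot(a, b)).asnumpy().all()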
15 changes: 8 additions & 7 deletions benchmark/opperf/nd_operations/nn_activation_operators.py
@@ -16,9 +16,9 @@
# under the License.

import mxnet as mx
from mxnet import nd
from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.utils.common_utils import merge_map_list
from benchmark.opperf.rules.default_params import MX_OP_MODULE

"""Performance benchmark tests for MXNet NDArray Activation Operators.
@@ -35,7 +35,7 @@
"""


def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the activation
operators (relu, sigmoid, softmax) in MXNet.
@@ -45,9 +45,9 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10
Context to run benchmarks
dtype: str, default 'float32'
Precision to use for benchmarks
warmup: int, default 10
warmup: int, default 25
Number of times to run for warmup
runs: int, default 50
runs: int, default 100
Number of runs to capture benchmark results
Returns
@@ -56,7 +56,7 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10
"""
# Relu and its variation
relu_benchmark_res = run_performance_test([nd.LeakyReLU],
relu_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "LeakyReLU")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -78,7 +78,7 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10

# Sigmoid => Covered as part of Unary ops
# Hard_Sigmoid
hard_sigmoid_benchmark_res = run_performance_test([nd.hard_sigmoid],
hard_sigmoid_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "hard_sigmoid")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -90,7 +90,8 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10
runs=runs)

# Softmax, LogSoftmax
softmax_benchmark_res = run_performance_test([nd.softmax, nd.log_softmax],
softmax_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "softmax"),
getattr(MX_OP_MODULE, "log_softmax")],
run_backward=True,
dtype=dtype,
ctx=ctx,
10 changes: 5 additions & 5 deletions benchmark/opperf/nd_operations/nn_basic_operators.py
@@ -16,9 +16,9 @@
# under the License.

import mxnet as mx
from mxnet import nd
from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.utils.common_utils import merge_map_list
from benchmark.opperf.rules.default_params import MX_OP_MODULE

"""Performance benchmark tests for MXNet NDArray basic NN Operators.
@@ -29,9 +29,9 @@
"""


def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
# FullyConnected operator benchmarks
fc_benchmark_res = run_performance_test([nd.FullyConnected],
fc_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "FullyConnected")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -49,7 +49,7 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10,
runs=runs)

# Dropout benchmarks
dropout_benchmark_res = run_performance_test([nd.Dropout],
dropout_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "Dropout")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -62,7 +62,7 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10,
warmup=warmup,
runs=runs)
# BatchNorm benchmarks
batchnorm_benchmark_res = run_performance_test([nd.BatchNorm],
batchnorm_benchmark_res = run_performance_test([getattr(MX_OP_MODULE, "BatchNorm")],
run_backward=True,
dtype=dtype,
ctx=ctx,
20 changes: 9 additions & 11 deletions benchmark/opperf/nd_operations/nn_conv_operators.py
@@ -16,9 +16,9 @@
# under the License.

import mxnet as mx
from mxnet import nd
from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.utils.common_utils import merge_map_list
from benchmark.opperf.rules.default_params import MX_OP_MODULE

"""Performance benchmark tests for MXNet NDArray Convolution and Pooling Operators.
@@ -51,7 +51,7 @@
"""


def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
pool_types = ['avg', 'max', 'sum']
global_pool_types = [0, 1]

@@ -61,7 +61,7 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, r
for pool_type in pool_types:
for global_pool in global_pool_types:
for pool1d_data in [(32, 3, 256), (32, 3, 64)]:
pool1d_benchmark_res += run_performance_test([nd.Pooling],
pool1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -70,13 +70,12 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, r
"pool_type": pool_type,
"global_pool": global_pool,
"stride": 1,
"pad": 1,
"layout": 'NCW'}
"pad": 1}
],
warmup=warmup,
runs=runs)
for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
pool2d_benchmark_res += run_performance_test([nd.Pooling],
pool2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -85,8 +84,7 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, r
"pool_type": pool_type,
"global_pool": global_pool,
"stride": (1, 1),
"pad": (0, 0),
"layout": 'NCHW'}
"pad": (0, 0)}
],
warmup=warmup,
runs=runs)
@@ -95,11 +93,11 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, r
return mx_pooling_op_results


def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
# Conv1D Benchmarks
conv1d_benchmark_res = []
for conv_data in [(32, 3, 256), (32, 3, 64)]:
conv1d_benchmark_res += run_performance_test([nd.Convolution],
conv1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")],
run_backward=True,
dtype=dtype,
ctx=ctx,
@@ -118,7 +116,7 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=1
# Conv2D Benchmarks
conv2d_benchmark_res = []
for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]:
conv2d_benchmark_res += run_performance_test([nd.Convolution],
conv2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")],
run_backward=True,
dtype=dtype,
ctx=ctx,
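The "Fix Pooling operator benchmarks" item in the commit message appears to correspond to the removal of the explicit layout argument above: Pooling defaults to NCW for 3-D and NCHW for 4-D inputs, so passing it was redundant. A hedged sketch of the direct 2-D call with the parameter set the benchmark now uses (the kernel value is elided in this diff and assumed here):

    import mxnet as mx

    data = mx.nd.random.uniform(shape=(32, 3, 64, 64))
    # No layout argument: a 4-D input is treated as NCHW by default.
    out = mx.nd.Pooling(data, kernel=(3, 3), pool_type='max',
                        global_pool=0, stride=(1, 1), pad=(0, 0))
    print(out.shape)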
6 changes: 3 additions & 3 deletions benchmark/opperf/nd_operations/random_sampling_operators.py
@@ -34,7 +34,7 @@
from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators


def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the random sampling
operators in MXNet.
@@ -44,9 +44,9 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', w
Context to run benchmarks
dtype: str, default 'float32'
Precision to use for benchmarks
warmup: int, default 10
warmup: int, default 25
Number of times to run for warmup
runs: int, default 50
runs: int, default 100
Number of runs to capture benchmark results
Returns
6 changes: 3 additions & 3 deletions benchmark/opperf/nd_operations/reduction_operators.py
@@ -31,7 +31,7 @@
from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks


def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the reduction
operators in MXNet.
@@ -41,9 +41,9 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=
Context to run benchmarks
dtype: str, default 'float32'
Precision to use for benchmarks
warmup: int, default 10
warmup: int, default 25
Number of times to run for warmup
runs: int, default 50
runs: int, default 100
Number of runs to capture benchmark results
Returns
6 changes: 3 additions & 3 deletions benchmark/opperf/nd_operations/unary_operators.py
@@ -35,7 +35,7 @@
from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks


def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50):
def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the unary
operators in MXNet.
@@ -45,9 +45,9 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10,
Context to run benchmarks
dtype: str, default 'float32'
Precision to use for benchmarks
warmup: int, default 10
warmup: int, default 25
Number of times to run for warmup
runs: int, default 50
runs: int, default 100
Number of runs to capture benchmark results
Returns
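The registry-driven suites (unary, reduction, random sampling, binary) share one elided body shape: pull the operator list from the registry, then benchmark it in bulk. A sketch under that assumption (helper names come from the imports shown in these diffs; exact signatures are inferred):

    import mxnet as mx
    from benchmark.opperf.utils.op_registry_utils import get_all_unary_operators
    from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks

    # Fetch every registered unary operator, then run the whole set through
    # the shared harness rather than one run_performance_test call per op.
    mx_unary_ops = get_all_unary_operators()
    results = run_op_benchmarks(mx_unary_ops, 'float32', mx.cpu(), 25, 100)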
17 changes: 11 additions & 6 deletions benchmark/opperf/opperf.py
@@ -40,7 +40,8 @@
from benchmark.opperf.nd_operations.nn_basic_operators import run_nn_basic_operators_benchmarks

from benchmark.opperf.utils.common_utils import merge_map_list, save_to_file
from benchmark.opperf.utils.op_registry_utils import get_operators_with_no_benchmark
from benchmark.opperf.utils.op_registry_utils import get_operators_with_no_benchmark,\
get_current_runtime_features


def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32'):
@@ -102,17 +103,18 @@ def _parse_mxnet_context(ctx):
device_id = int(ctx[4:-1])
return mx.gpu(device_id)


def main():
# 1. GET USER INPUTS
parser = argparse.ArgumentParser(
description='Run all the MXNet operators (NDArray) benchmarks')
parser = argparse.ArgumentParser(description='Run all the MXNet operator benchmarks')

parser.add_argument('--ctx', type=str, default='cpu',
help='Global context to run all benchmarks. By default, cpu on a '
'CPU machine, gpu(0) on a GPU machine. '
'Valid Inputs - cpu, gpu, gpu(0), gpu(1)...')
parser.add_argument('--dtype', type=str, default='float32', help='DType (Precision) to run benchmarks. By default, '
'float32. Valid Inputs - float32, float64.')
'float32. Valid Inputs - float32, float64, int32, '
'int64')
parser.add_argument('-f', '--output-format', type=str, default='json',
choices=['json', 'md'],
help='Benchmark result output format. By default, json. '
@@ -129,17 +131,20 @@ def main():
# 2. RUN BENCHMARKS
ctx = _parse_mxnet_context(args.ctx)
dtype = args.dtype
final_benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=args.dtype)
final_benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype)

# 3. PREPARE OUTPUTS
save_to_file(final_benchmark_results, args.output_file, args.output_format)
run_time_features = get_current_runtime_features()
save_to_file(final_benchmark_results, args.output_file, args.output_format, run_time_features)

# 4. Generate list of MXNet operators not covered in benchmarks
ops_not_covered = get_operators_with_no_benchmark(final_benchmark_results.keys())
for idx, op in enumerate(ops_not_covered):
print(f"{idx}. {op}")

return 0


if __name__ == '__main__':
sys.exit(main())
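get_current_runtime_features, newly imported in opperf.py, implements the "automated way to fetch compile/runtime flags" bullet; its definition is not among the loaded diffs. A plausible sketch built on MXNet's public runtime API (mx.runtime.Features ships with MXNet 1.5; the returned dict shape is an assumption):

    import mxnet as mx

    # Assumed sketch: map every known compile-time feature flag (CUDA,
    # MKLDNN, OPENMP, ...) to whether this MXNet binary enables it.
    def get_current_runtime_features():
        features = mx.runtime.Features()
        return {name: features.is_enabled(name) for name in features.keys()}

    print(get_current_runtime_features())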
