diff --git a/benchmark/opperf/nd_operations/nn_optimizer_operators.py b/benchmark/opperf/nd_operations/nn_optimizer_operators.py
index 643932b5738e..ac380655d136 100644
--- a/benchmark/opperf/nd_operations/nn_optimizer_operators.py
+++ b/benchmark/opperf/nd_operations/nn_optimizer_operators.py
@@ -16,8 +16,13 @@
 # under the License.
 
 import mxnet as mx
+from mxnet import nd
+
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
 from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks
 from benchmark.opperf.utils.op_registry_utils import get_all_optimizer_operators
+from benchmark.opperf.utils.common_utils import merge_map_list
+from benchmark.opperf.rules.default_params import MX_OP_MODULE
 
 """Performance benchmark tests for MXNet Neural Network Optimizer Update Operators.
 
@@ -33,6 +38,19 @@
 5. rmsprop_update
 6. ftrl_update
 7. adam_update
+8. preloaded_multi_*
+    8.1 preloaded_multi_sgd_mom_update
+    8.2 preloaded_multi_sgd_update
+    8.3 preloaded_multi_mp_sgd_update
+    8.4 preloaded_multi_mp_sgd_mom_update
+9. lamb_*
+    9.1 lamb_update_phase1
+    9.2 lamb_update_phase2
+10. multi_*
+    10.1 multi_sgd_update
+    10.2 multi_sgd_mom_update
+    10.3 multi_mp_sgd_update
+    10.4 multi_mp_sgd_mom_update
 """
 
 
@@ -58,9 +76,60 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='
     Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
 
     """
-    # Fetch all optimizer operators
+    # Run independent tests for ops that need specific input data
+    multi_mp_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")],
+                                                inputs=[{"args0": nd.random_normal(shape=(5,5)),
+                                                "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)),
+                                                "args3": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2,
+                                                "out": nd.random_normal(shape=(5,5))}],run_backward=False)
+
+    multi_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_mom_update")],
+                                             inputs=[{"args0": nd.random_normal(shape=(5,5)),
+                                             "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)),
+                                             "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    multi_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_update")],
+                                         inputs=[{"args0": nd.random_normal(shape=(5,5)),
+                                         "args1": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2,
+                                         "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    multi_mp_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_update")],
+                                            inputs=[{"args0": nd.random_normal(shape=(5,5)),
+                                            "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)),
+                                            "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    preloaded_multi_mp_sgd_res = run_performance_test(
+                                 [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")],
+                                 inputs=[{"args0": nd.random_normal(shape=(5,5)),
+                                          "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)),
+                                          "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)),
+                                          "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    preloaded_multi_sgd_mom_res = run_performance_test(
+                                  [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")],
+                                  inputs=[{"args0": nd.random_normal(shape=(5,5)),
+                                           "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)),
+                                           "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)),
+                                           "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    preloaded_multi_sgd_res = run_performance_test(
+                              [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")],
+                              inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)),
+                                       "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)),
+                                       "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    preloaded_multi_mp_sgd_mom_res = run_performance_test(
+                                     [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")],
+                                     inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)),
+                                              "args2": nd.random_normal(shape=(5,5)), "args3": nd.random_normal(shape=(5,5)),
+                                              "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)),
+                                              "out": nd.random_normal(shape=(5,5))}], run_backward=False)
+
+    # Fetch remaining optimizer operators
     mx_optimizer_ops = get_all_optimizer_operators()
 
     # Run benchmarks
     mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs)
-    return mx_optimizer_op_results
+    return merge_map_list(multi_mp_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res + preloaded_multi_mp_sgd_res +\
+                          preloaded_multi_sgd_mom_res + preloaded_multi_sgd_res + preloaded_multi_mp_sgd_mom_res +\
+                          [mx_optimizer_op_results])
diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py
index fdbf01aa41c6..08075906fae5 100644
--- a/benchmark/opperf/nd_operations/unary_operators.py
+++ b/benchmark/opperf/nd_operations/unary_operators.py
@@ -34,6 +34,9 @@
 from benchmark.opperf.utils.op_registry_utils import get_all_unary_operators
 from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks
 
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
+from benchmark.opperf.utils.common_utils import merge_map_list
+from benchmark.opperf.rules.default_params import MX_OP_MODULE
 
 def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
     """Runs benchmarks with the given context and precision (dtype)for all the unary
@@ -57,8 +60,22 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n
     Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
 
     """
+    # Run amp_multicast as it needs data as positional argument
+    amp_multicast_benchmark = run_performance_test([getattr(MX_OP_MODULE, "amp_multicast")],
+                                                   run_backward=True,
+                                                   dtype=dtype,
+                                                   ctx=ctx,
+                                                   profiler=profiler,
+                                                   inputs=[{"args": [(1024, 1024)],
+                                                            "num_outputs":1},
+                                                           {"args": [(10000, 1)],
+                                                            "num_outputs":1}],
+                                                   warmup=warmup,
+                                                   runs=runs)
+
     # Fetch all Unary Operators
     mx_unary_broadcast_ops = get_all_unary_operators()
+
     # Run benchmarks
     mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, warmup, runs)
-    return mx_unary_op_results
+    return merge_map_list(amp_multicast_benchmark + [mx_unary_op_results])
diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py
index 6fca0d553288..8f10e4ebbeb6 100644
--- a/benchmark/opperf/rules/default_params.py
+++ b/benchmark/opperf/rules/default_params.py
@@ -31,6 +31,9 @@
 
 # For Unary operators like abs, arccos, arcsin etc..
 DEFAULT_DATA = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_DTYPE = ['float32', 'int32', 'float32']  # required parameter for amp_cast, cast
+DEFAULT_DTYPE_INT = ['int32', 'int64', 'int32']  # randint works for int* types only
+DEFAULT_DTYPE_FLOAT = ['float16', 'float32', 'float64']  # random_exp works for float* types only
 
 # For Binary miscellaneous operators like choose_element0_index
 # argument data must be indexed via an NDArray.
@@ -89,6 +92,8 @@
 DEFAULT_V = [(1024, 1024), (10000, 1), (10000, 100)]
 DEFAULT_Z = [(1024, 1024), (10000, 1), (10000, 100)]
 DEFAULT_G = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_R1 = [(1, 1024), (1, 1), (1, 100)]
+DEFAULT_R2 = [(1, 1024), (1, 1), (1, 100)]
 DEFAULT_DELTA = [(1024, 1024), (10000, 1), (10000, 100)]
 DEFAULT_LRS = [(0.1, 0.1)]
 DEFAULT_LR = [0.1, 0.5, 0.9]
@@ -148,6 +153,9 @@
 
 # Default Inputs. MXNet Op Param Name to Default Input mapping
 DEFAULTS_INPUTS = {"data": DEFAULT_DATA,
+                   "dtype": DEFAULT_DTYPE,
+                   "dtype_int": DEFAULT_DTYPE_INT,
+                   "dtype_float": DEFAULT_DTYPE_FLOAT,
                    "sample": DEFAULT_SAMPLE,
                    "lhs": DEFAULT_LHS,
                    "rhs": DEFAULT_RHS,
@@ -173,6 +181,8 @@
                    "mean": DEFAULT_MEAN,
                    "var": DEFAULT_VAR,
                    "mom": DEFAULT_MOM,
+                   "r1": DEFAULT_R1,
+                   "r2": DEFAULT_R2,
                    "n": DEFAULT_N,
                    "d": DEFAULT_D,
                    "v": DEFAULT_V,
@@ -182,6 +192,7 @@
                    "lr": DEFAULT_LR,
                    "lrs": DEFAULT_LRS,
                    "wds": DEFAULT_LRS,
+                   "wd": DEFAULT_LR,
                    "gamma1": DEFAULT_GAMMA_1,
                    "gamma2": DEFAULT_GAMMA_2,
                    "epsilon": DEFAULT_EPSILON,
@@ -239,4 +250,4 @@
                           "weight", "weight32", "grad", "mean", "var", "mom", "n", "d",
                           "v", "z", "g", "delta", "args", "indices", "shape_like", "y",
                           "x", "condition", "a", "index", "raveL_data", "label", "grid",
-                          "A", "B", "C", "rois"]
+                          "A", "B", "C", "r1", "r2", "rois"]
diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
index da7e2b8910aa..60914118a56e 100644
--- a/benchmark/opperf/utils/benchmark_utils.py
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -31,17 +31,11 @@
 def _prepare_op_inputs(inputs, run_backward, dtype, ctx):
     mx.random.seed(41)
     kwargs_list = []
-    args_list = []
 
     for inp in inputs:
         kwargs = {}
         for key, value in inp.items():
-            if key in PARAMS_OF_TYPE_NDARRAY and key=='args':
-                args_list.append(get_mx_ndarray(ctx=ctx, in_tensor=value,
-                                                dtype=dtype,
-                                                initializer=nd.normal,
-                                                attach_grad=run_backward))
-            elif key in PARAMS_OF_TYPE_NDARRAY:
+            if key in PARAMS_OF_TYPE_NDARRAY:
                 kwargs[key] = get_mx_ndarray(ctx=ctx, in_tensor=value,
                                              dtype=dtype,
                                              initializer=nd.normal,
@@ -49,10 +43,60 @@ def _prepare_op_inputs(inputs, run_backward, dtype, ctx):
             else:
                 kwargs[key] = value
         kwargs_list.append(kwargs)
-    return args_list, kwargs_list
+    return kwargs_list
 
 
-def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, args_list, kwargs_list, profiler):
+def parse_input_ndarray(input_dict):
+    """Parse input for ndarray and extract array shape for better readability
+
+    Parameters
+    ----------
+    input_dict : dict
+         Dictionary of input
+
+    Input Dictionary
+
+    'inputs': {'weight':
+    [[ 2.2122064 0.7740038 1.0434405 1.1839255 1.8917114 ]
+     [-1.2347414 -1.771029 -0.45138445 0.57938355 -1.856082 ]
+     [-1.9768796 -0.20801921 0.2444218 -0.03716067 -0.48774993]
+     [-0.02261727 0.57461417 1.4661262 0.6862904 0.35496104]
+     [ 1.0731696 0.12017461 -0.9711102 -0.77569664 -0.7882176 ]]
+    <NDArray 5x5 @cpu(0)>, 'grad':
+    [[ 0.7417728 -1.4734439 -1.0730928 -1.0424827 -1.3278849 ]
+     [-1.4749662 -0.52414197 1.2662556 0.8950642 -0.6015945 ]
+     [ 1.2040559 -0.9712193 -0.58256227 0.3717077 0.9300072 ]
+     [-1.4225755 -0.5176199 2.0088325 0.2863085 0.5604595 ]
+     [ 0.96975976 -0.52853745 -1.88909 0.65479124 -0.45481315]]
+    <NDArray 5x5 @cpu(0)>, 'mean':
+    [[ 0.32510808 -1.3002341 0.3679345 1.4534262 0.24154152]
+     [ 0.47898006 0.96885103 -1.0218245 -0.06812762 -0.31868345]
+     [-0.17634277 0.35655284 0.74419165 0.7787424 0.6087823 ]
+     [ 1.0741756 0.06642842 0.8486986 -0.8003802 -0.16882208]
+     [ 0.93632793 0.357444 0.77932847 -1.0103073 -0.39157307]]
+    <NDArray 5x5 @cpu(0)>, 'var':
+    [[ 1.3166187 -0.43292624 0.71535987 0.9254156 -0.90495086]
+     [-0.074684 0.82254 -1.8785107 0.8858836 1.9118724 ]
+     [ 0.33342266 0.11883813 -1.9198899 -0.67558455 1.007749 ]
+     [-0.35391203 1.6323917 -0.33354783 -1.7378405 0.7737382 ]
+     [ 0.89126545 3.2904532 -1.1976235 1.8938874 -0.5669272 ]]
+    <NDArray 5x5 @cpu(0)>, 't': 1, 'wd': 0.1}
+
+    Output
+    {'inputs': {'weight': '<NDArray 5x5 @cpu(0)>', 'grad': '<NDArray 5x5 @cpu(0)>', 'mean': '<NDArray 5x5 @cpu(0)>', 'var': '<NDArray 5x5 @cpu(0)>', 't': 1, 'wd': 0.1}
+    """
+    no_new_line_input_dict=dict()
+    for key,value in input_dict.items():
+        if isinstance(value,nd.NDArray):
+            # if value in input is NDArray then extract last line only
+            val = str(value).split('\n')[-1]
+            no_new_line_input_dict[key]=val
+        else:
+            no_new_line_input_dict[key]=value
+    return no_new_line_input_dict
+
+
+def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler):
     if profiler == 'native':
         if run_backward:
             benchmark_helper_func = cpp_profile(nd_forward_backward_and_profile)
@@ -67,28 +111,20 @@ def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, ar
         raise ValueError("Incorrect input for profiler. Valid input - 'python' or 'native'")
 
     # Warm up, ignore the profiler output
-    if not args_list:
-        _, _ = benchmark_helper_func(op, warmup, [], **kwargs_list[0])
-    else:
-        _, _ = benchmark_helper_func(op, warmup, args_list[0], **kwargs_list[0])
+    _, _ = benchmark_helper_func(op, warmup, **kwargs_list[0])
 
     # Run Benchmarks
     op_benchmark_result = {op.__name__: []}
     logging.info("Begin Benchmark - {name}".format(name=op.__name__))
-    if not args_list:
-        for idx, kwargs in enumerate(kwargs_list):
-            _, profiler_output = benchmark_helper_func(op, runs, [], **kwargs)
 
-            # Add inputs used for profiling this operator into result
-            profiler_output = merge_map_list([{"inputs": inputs[idx]}] + [profiler_output])
-            op_benchmark_result[op.__name__].append(profiler_output)
-    else:
-        for idx, (args, kwargs) in enumerate(zip(args_list, kwargs_list)):
-            _, profiler_output = benchmark_helper_func(op, runs, args, **kwargs)
+    for idx, kwargs in enumerate(kwargs_list):
+        _, profiler_output = benchmark_helper_func(op, runs, **kwargs)
 
-            # Add inputs used for profiling this operator into result
-            profiler_output = merge_map_list([{"inputs": inputs[idx]}] + [profiler_output])
-            op_benchmark_result[op.__name__].append(profiler_output)
+        # Add inputs used for profiling this operator into result
+        # parse input if it contains ndarray, replace with shape info for better markdown readability
+        new_inp = parse_input_ndarray(inputs[idx])
+        profiler_output = merge_map_list([{"inputs": new_inp}] + [profiler_output])
+        op_benchmark_result[op.__name__].append(profiler_output)
     logging.info("Complete Benchmark - {name}".format(name=op.__name__))
     return op_benchmark_result
 
@@ -128,7 +164,7 @@ def run_performance_test(ops, inputs, run_backward=True,
     List of dictionary of benchmark results. key -> name of the operator, Value is benchmark results.
 
     """
-    args_list, kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx)
+    kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx)
 
     if not isinstance(ops, list):
         ops = [ops]
@@ -136,7 +172,7 @@ def run_performance_test(ops, inputs, run_backward=True,
     op_benchmark_result = []
     for op in ops:
         if hasattr(mx.nd, op.__name__):
-            benchmark_result = _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, args_list, kwargs_list, profiler)
+            benchmark_result = _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler)
         else:
             raise ValueError("Unknown NDArray operator provided to benchmark. - ", op.__name__)
         op_benchmark_result.append(benchmark_result)
diff --git a/benchmark/opperf/utils/ndarray_utils.py b/benchmark/opperf/utils/ndarray_utils.py
index 5d1bbbd8282d..3f5dda8f036b 100644
--- a/benchmark/opperf/utils/ndarray_utils.py
+++ b/benchmark/opperf/utils/ndarray_utils.py
@@ -20,7 +20,7 @@
 import mxnet.ndarray as nd
 
 
-def nd_forward_backward_and_profile(op, runs, *args, **kwargs):
+def nd_forward_backward_and_profile(op, runs, **kwargs):
     """Helper function to run a given NDArray operator (op) for 'runs' number of times with
     given args and kwargs. Executes both forward and backward pass.
 
@@ -32,8 +32,6 @@ def nd_forward_backward_and_profile(op, runs, *args, **kwargs):
         NDArray operator (Function reference) to execute. Example: mx.nd.add
     runs: int
         Number of times to execute the operation
-    args:
-        Arguments for the NDArray operator (op) being executed.
     kwargs:
         Key value arguments for the NDArray operator (op) being executed.
 
@@ -44,16 +42,26 @@ def nd_forward_backward_and_profile(op, runs, *args, **kwargs):
     """
     for _ in range(runs):
         with mx.autograd.record():
-            if not isinstance(args[0], nd.NDArray):
-                res = op(**kwargs)
+            args = []
+            # need to create a new dictionary because can't update dict while iterating
+            kwargs_new = dict()
+            for key in kwargs:
+                # separate positional args from key-worded args
+                if key.startswith("args"):
+                    args.append(kwargs[key])
+                else:
+                    kwargs_new[key]=kwargs[key]
+            # check for positional args
+            if len(args):
+                res = op(*args, **kwargs_new)
             else:
-                res = op(*args, **kwargs)
+                res = op(**kwargs_new)
         res.backward()
         nd.waitall()
     return res
 
 
-def nd_forward_and_profile(op, runs, *args, **kwargs):
+def nd_forward_and_profile(op, runs, **kwargs):
     """Helper function to run a given NDArray operator (op) for 'runs' number of times with
     given args and kwargs. Executes ONLY forward pass.
 
@@ -65,8 +73,6 @@ def nd_forward_and_profile(op, runs, *args, **kwargs):
         NDArray operator (Function reference) to execute. Example: mx.nd.add
     runs: int
         Number of time to execute the operation
-    args:
-        Arguments for the NDArray operator (op) being executed.
     kwargs:
         Key value arguments for the NDArray operator (op) being executed.
 
@@ -75,10 +81,20 @@ def nd_forward_and_profile(op, runs, *args, **kwargs):
     any results from NDArray operation execution
     """
     for _ in range(runs):
-        if not isinstance(args[0], nd.NDArray):
-            res = op(**kwargs)
+        args = []
+        # need to create a new dictionary because can't update dict while iterating
+        kwargs_new = dict()
+        for key in kwargs:
+            # separate positional args from key-worded args
+            if key.startswith("args"):
+                args.append(kwargs[key])
+            else:
+                kwargs_new[key]=kwargs[key]
+        # check for positional args
+        if len(args):
+            res = op(*args, **kwargs_new)
         else:
-            res = op(*args, **kwargs)
+            res = op(**kwargs_new)
         nd.waitall()
     return res
 
diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py
index b9f1e45bbd37..d2598310e852 100644
--- a/benchmark/opperf/utils/op_registry_utils.py
+++ b/benchmark/opperf/utils/op_registry_utils.py
@@ -121,11 +121,21 @@ def prepare_op_inputs(op, arg_params):
     # For ops with args that need to change shape/value for different ops
     custom_data = ['Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian']
 
+    int_only = ['random_randint']
+
     # Prepare op to default input mapping
     arg_values = {}
     for arg_name, arg_type in zip(arg_params["params"]["arg_names"],
                                   arg_params["params"]["arg_types"]):
-        if "NDArray" in arg_type and op == "ravel_multi_index":
+        # Due to lack of an internal API for fetching permissible dtype
+        # added a logic for using float only dtype as input for ops that take only floats
+        # same for randint (which is the only op that takes only int as input)
+        # rest all operators take int as well as float
+        if op in int_only and arg_name == "dtype":
+            arg_values[arg_name] = DEFAULTS_INPUTS["dtype_int"]
+        elif op.startswith(('random','sample')) and arg_name == "dtype":
+            arg_values[arg_name] = DEFAULTS_INPUTS["dtype_float"]
+        elif "NDArray" in arg_type and op == "ravel_multi_index":
             arg_values[arg_name] = DEFAULTS_INPUTS["ravel_data"]
         elif op in custom_data and arg_name + "_" + op.lower() in DEFAULTS_INPUTS:
             arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_" + op.lower()]
@@ -174,14 +184,18 @@ def get_all_unary_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
+    # Cast operators (cast & amp_cast are unary)
+    cast_ops = ['cast', 'amp_cast']
+
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
 
     # Filter for unary broadcast operators
     unary_broadcast_mx_operators = {}
     for op_name, op_params in mx_operators.items():
-        if op_params["params"]["narg"] == 1 and \
-                "data" in op_params["params"]["arg_names"]:
+        if (op_params["params"]["narg"] == 1 and \
+                "data" in op_params["params"]["arg_names"]) or \
+                op_name in cast_ops:
             unary_broadcast_mx_operators[op_name] = mx_operators[op_name]
     return unary_broadcast_mx_operators
 
@@ -305,8 +319,9 @@ def get_all_reduction_operators():
     # Filter for Reduction operators
     reduction_mx_operators = {}
     for op_name, op_params in mx_operators.items():
-        if op_params["params"]["narg"] == 4 and \
-                set(["data", "axis", "exclude", "keepdims"]).issubset(set(op_params["params"]["arg_names"])):
+        if (op_params["params"]["narg"] == 4 and \
+                set(["data", "axis", "exclude", "keepdims"]).issubset(set(op_params["params"]["arg_names"])) \
+                or op_name == 'norm'):
             reduction_mx_operators[op_name] = mx_operators[op_name]
     return reduction_mx_operators
 
@@ -340,7 +355,8 @@ def get_all_optimizer_operators():
     """
     optimizer_ops = ['mp_sgd_update', 'signum_update', 'rmspropalex_update', 'ftml_update', 'rmsprop_update',
                      'sgd_mom_update', 'signsgd_update', 'mp_sgd_mom_update', 'ftrl_update', 'sgd_update',
-                     'adam_update']
+                     'adam_update', 'mp_nag_mom_update', 'nag_mom_update', 'lamb_update_phase1',
+                     'lamb_update_phase2']
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()