[OpPerf] Add norm, cast ops, remaining optimizer ops (#17542)
* add mp_nag_mom, nag_mom, lamb_update_phase1 & lamb_update_phase2 ops

* add norm to reduction ops

* add preloaded_*, multi_* optimizer ops

* add cast ops to unary ops in opperf

* change API to handle args in profiler_util instead of benchmark_util

* clean up positional args

* fix amp_cast, cast, and lamb_update_* issues

* fix markdown readability issue

* add 3 types of dtype vars as inputs for 3 different categories of ops
ChaiBapchya committed Feb 13, 2020
1 parent eecf2ed commit 93c123d
Showing 6 changed files with 214 additions and 49 deletions.
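For orientation, opperf's run_performance_test helper (exercised throughout the diff below) benchmarks a single operator against explicitly supplied inputs. A minimal sketch of invoking one of the newly covered ops, mirroring the shapes this commit uses (illustrative only, not part of the change):

import mxnet as mx
from mxnet import nd
from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.rules.default_params import MX_OP_MODULE

# multi_sgd_update takes its weights and gradients as positional args0..argN,
# plus learning rates (lrs) and weight decays (wds)
res = run_performance_test(
    [getattr(MX_OP_MODULE, "multi_sgd_update")],
    inputs=[{"args0": nd.random_normal(shape=(5, 5)),   # weight
             "args1": nd.random_normal(shape=(5, 5)),   # gradient
             "lrs": 0.1, "wds": 0.2,
             "out": nd.random_normal(shape=(5, 5))}],
    run_backward=False)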
73 changes: 71 additions & 2 deletions benchmark/opperf/nd_operations/nn_optimizer_operators.py
@@ -16,8 +16,13 @@
# under the License.

import mxnet as mx
from mxnet import nd

from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks
from benchmark.opperf.utils.op_registry_utils import get_all_optimizer_operators
from benchmark.opperf.utils.common_utils import merge_map_list
from benchmark.opperf.rules.default_params import MX_OP_MODULE

"""Performance benchmark tests for MXNet Neural Network Optimizer Update Operators.
@@ -33,6 +38,19 @@
5. rmsprop_update
6. ftrl_update
7. adam_update
8. preloaded_multi_*
   8.1 preloaded_multi_sgd_mom_update
   8.2 preloaded_multi_sgd_update
   8.3 preloaded_multi_mp_sgd_update
   8.4 preloaded_multi_mp_sgd_mom_update
9. lamb_*
   9.1 lamb_update_phase1
   9.2 lamb_update_phase2
10. multi_*
   10.1 multi_sgd_update
   10.2 multi_sgd_mom_update
   10.3 multi_mp_sgd_update
   10.4 multi_mp_sgd_mom_update
"""


@@ -58,9 +76,60 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Run independent tests for ops that need specific input data
    multi_mp_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)),
                 "args1": nd.random_normal(shape=(5, 5)), "args2": nd.random_normal(shape=(5, 5)),
                 "args3": nd.random_normal(shape=(5, 5)), "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    multi_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)),
                 "args1": nd.random_normal(shape=(5, 5)), "args2": nd.random_normal(shape=(5, 5)),
                 "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    multi_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)),
                 "args1": nd.random_normal(shape=(5, 5)), "lrs": 0.1, "wds": 0.2,
                 "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    multi_mp_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "multi_mp_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)),
                 "args1": nd.random_normal(shape=(5, 5)), "args2": nd.random_normal(shape=(5, 5)),
                 "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    preloaded_multi_mp_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)),
                 "args1": nd.random_normal(shape=(5, 5)), "args2": nd.random_normal(shape=(5, 5)),
                 "args3": nd.random_normal(shape=(1,)), "args4": nd.random_normal(shape=(1,)),
                 "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    preloaded_multi_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)),
                 "args1": nd.random_normal(shape=(5, 5)), "args2": nd.random_normal(shape=(5, 5)),
                 "args3": nd.random_normal(shape=(1,)), "args4": nd.random_normal(shape=(1,)),
                 "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    preloaded_multi_sgd_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args4": nd.random_normal(shape=(1,)), "args5": nd.random_normal(shape=(1,)),
                 "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    preloaded_multi_mp_sgd_mom_res = run_performance_test(
        [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")],
        inputs=[{"args0": nd.random_normal(shape=(5, 5)), "args1": nd.random_normal(shape=(5, 5)),
                 "args2": nd.random_normal(shape=(5, 5)), "args3": nd.random_normal(shape=(5, 5)),
                 "args4": nd.random_normal(shape=(1,)), "args5": nd.random_normal(shape=(1,)),
                 "out": nd.random_normal(shape=(5, 5))}], run_backward=False)

    # Fetch remaining optimizer operators
    mx_optimizer_ops = get_all_optimizer_operators()

    # Run benchmarks
    mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs)
    return merge_map_list(multi_mp_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res +
                          preloaded_multi_mp_sgd_res + preloaded_multi_sgd_mom_res +
                          preloaded_multi_sgd_res + preloaded_multi_mp_sgd_mom_res +
                          [mx_optimizer_op_results])
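A hedged usage sketch for the function above, assuming the repository root is on PYTHONPATH; merge_map_list folds the hand-rolled results and the registry-driven results into a single dictionary keyed by operator name:

import mxnet as mx
from benchmark.opperf.nd_operations.nn_optimizer_operators import run_optimizer_operators_benchmarks

# Benchmark every optimizer-update operator on CPU with the native profiler
results = run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                             profiler='native', warmup=25, runs=100)
for op_name, op_results in results.items():
    print(op_name, op_results)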
19 changes: 18 additions & 1 deletion benchmark/opperf/nd_operations/unary_operators.py
@@ -34,6 +34,9 @@
from benchmark.opperf.utils.op_registry_utils import get_all_unary_operators
from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks

from benchmark.opperf.utils.benchmark_utils import run_performance_test
from benchmark.opperf.utils.common_utils import merge_map_list
from benchmark.opperf.rules.default_params import MX_OP_MODULE

def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
"""Runs benchmarks with the given context and precision (dtype)for all the unary
@@ -57,8 +60,22 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n
    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
    """
    # Run amp_multicast as it needs data as positional argument
    amp_multicast_benchmark = run_performance_test([getattr(MX_OP_MODULE, "amp_multicast")],
                                                   run_backward=True,
                                                   dtype=dtype,
                                                   ctx=ctx,
                                                   profiler=profiler,
                                                   inputs=[{"args": [(1024, 1024)],
                                                            "num_outputs": 1},
                                                           {"args": [(10000, 1)],
                                                            "num_outputs": 1}],
                                                   warmup=warmup,
                                                   runs=runs)

    # Fetch all Unary Operators
    mx_unary_broadcast_ops = get_all_unary_operators()

    # Run benchmarks
    mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, warmup, runs)
    return merge_map_list(amp_multicast_benchmark + [mx_unary_op_results])
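Likewise for the unary benchmarks: amp_multicast is run separately above because it takes its tensors through the variable-length positional parameter "args" rather than the usual "data" default. A short sketch under the same PYTHONPATH assumption:

import mxnet as mx
from benchmark.opperf.nd_operations.unary_operators import run_mx_unary_operators_benchmarks

# cast and amp_cast are now picked up automatically via the dtype defaults
# added to default_params.py in this commit
results = run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                            profiler='native', warmup=25, runs=100)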
13 changes: 12 additions & 1 deletion benchmark/opperf/rules/default_params.py
@@ -31,6 +31,9 @@

# For Unary operators like abs, arccos, arcsin etc..
DEFAULT_DATA = [(1024, 1024), (10000, 1), (10000, 100)]
DEFAULT_DTYPE = ['float32', 'int32', 'float32'] # required parameter for amp_cast, cast
DEFAULT_DTYPE_INT = ['int32', 'int64', 'int32'] # randint works for int* types only
DEFAULT_DTYPE_FLOAT = ['float16', 'float32', 'float64'] # random_exp works for float* types only

# For Binary miscellaneous operators like choose_element0_index
# argument data must be indexed via an NDArray.
@@ -89,6 +92,8 @@
DEFAULT_V = [(1024, 1024), (10000, 1), (10000, 100)]
DEFAULT_Z = [(1024, 1024), (10000, 1), (10000, 100)]
DEFAULT_G = [(1024, 1024), (10000, 1), (10000, 100)]
DEFAULT_R1 = [(1, 1024), (1, 1), (1, 100)]
DEFAULT_R2 = [(1, 1024), (1, 1), (1, 100)]
DEFAULT_DELTA = [(1024, 1024), (10000, 1), (10000, 100)]
DEFAULT_LRS = [(0.1, 0.1)]
DEFAULT_LR = [0.1, 0.5, 0.9]
@@ -148,6 +153,9 @@

# Default Inputs. MXNet Op Param Name to Default Input mapping
DEFAULTS_INPUTS = {"data": DEFAULT_DATA,
"dtype": DEFAULT_DTYPE,
"dtype_int": DEFAULT_DTYPE_INT,
"dtype_float": DEFAULT_DTYPE_FLOAT,
"sample": DEFAULT_SAMPLE,
"lhs": DEFAULT_LHS,
"rhs": DEFAULT_RHS,
@@ -173,6 +181,8 @@
"mean": DEFAULT_MEAN,
"var": DEFAULT_VAR,
"mom": DEFAULT_MOM,
"r1": DEFAULT_R1,
"r2": DEFAULT_R2,
"n": DEFAULT_N,
"d": DEFAULT_D,
"v": DEFAULT_V,
@@ -182,6 +192,7 @@
"lr": DEFAULT_LR,
"lrs": DEFAULT_LRS,
"wds": DEFAULT_LRS,
"wd": DEFAULT_LR,
"gamma1": DEFAULT_GAMMA_1,
"gamma2": DEFAULT_GAMMA_2,
"epsilon": DEFAULT_EPSILON,
@@ -239,4 +250,4 @@
"weight", "weight32", "grad", "mean", "var", "mom", "n", "d",
"v", "z", "g", "delta", "args", "indices", "shape_like", "y",
"x", "condition", "a", "index", "raveL_data", "label", "grid",
"A", "B", "C", "rois"]
"A", "B", "C", "r1", "r2", "rois"]
90 changes: 63 additions & 27 deletions benchmark/opperf/utils/benchmark_utils.py
@@ -31,28 +31,72 @@
def _prepare_op_inputs(inputs, run_backward, dtype, ctx):
    mx.random.seed(41)
    kwargs_list = []

    for inp in inputs:
        kwargs = {}
        for key, value in inp.items():
            if key in PARAMS_OF_TYPE_NDARRAY:
                kwargs[key] = get_mx_ndarray(ctx=ctx, in_tensor=value,
                                             dtype=dtype,
                                             initializer=nd.normal,
                                             attach_grad=run_backward)
            else:
                kwargs[key] = value
        kwargs_list.append(kwargs)
    return kwargs_list


def parse_input_ndarray(input_dict):
    """Parse input for ndarray and extract array shape for better readability

    Parameters
    ----------
    input_dict : dict
        Dictionary of input

    Input Dictionary
    'inputs': {'weight':
[[ 2.2122064 0.7740038 1.0434405 1.1839255 1.8917114 ]
[-1.2347414 -1.771029 -0.45138445 0.57938355 -1.856082 ]
[-1.9768796 -0.20801921 0.2444218 -0.03716067 -0.48774993]
[-0.02261727 0.57461417 1.4661262 0.6862904 0.35496104]
[ 1.0731696 0.12017461 -0.9711102 -0.77569664 -0.7882176 ]]
<NDArray 5x5 @cpu(0)>, 'grad':
[[ 0.7417728 -1.4734439 -1.0730928 -1.0424827 -1.3278849 ]
[-1.4749662 -0.52414197 1.2662556 0.8950642 -0.6015945 ]
[ 1.2040559 -0.9712193 -0.58256227 0.3717077 0.9300072 ]
[-1.4225755 -0.5176199 2.0088325 0.2863085 0.5604595 ]
[ 0.96975976 -0.52853745 -1.88909 0.65479124 -0.45481315]]
<NDArray 5x5 @cpu(0)>, 'mean':
[[ 0.32510808 -1.3002341 0.3679345 1.4534262 0.24154152]
[ 0.47898006 0.96885103 -1.0218245 -0.06812762 -0.31868345]
[-0.17634277 0.35655284 0.74419165 0.7787424 0.6087823 ]
[ 1.0741756 0.06642842 0.8486986 -0.8003802 -0.16882208]
[ 0.93632793 0.357444 0.77932847 -1.0103073 -0.39157307]]
<NDArray 5x5 @cpu(0)>, 'var':
[[ 1.3166187 -0.43292624 0.71535987 0.9254156 -0.90495086]
[-0.074684 0.82254 -1.8785107 0.8858836 1.9118724 ]
[ 0.33342266 0.11883813 -1.9198899 -0.67558455 1.007749 ]
[-0.35391203 1.6323917 -0.33354783 -1.7378405 0.7737382 ]
[ 0.89126545 3.2904532 -1.1976235 1.8938874 -0.5669272 ]]
<NDArray 5x5 @cpu(0)>, 't': 1, 'wd': 0.1}
    Output
    {'inputs': {'weight': '<NDArray 5x5 @cpu(0)>', 'grad': '<NDArray 5x5 @cpu(0)>', 'mean': '<NDArray 5x5 @cpu(0)>', 'var': '<NDArray 5x5 @cpu(0)>', 't': 1, 'wd': 0.1}}
    """
    no_new_line_input_dict = dict()
    for key, value in input_dict.items():
        if isinstance(value, nd.NDArray):
            # if value in input is an NDArray, keep only the last line of its repr (the shape info)
            val = str(value).split('\n')[-1]
            no_new_line_input_dict[key] = val
        else:
            no_new_line_input_dict[key] = value
    return no_new_line_input_dict

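# Illustration (not part of the commit): parse_input_ndarray condenses each NDArray
# value to the trailing shape line of its repr and leaves scalars untouched, e.g.
#   parse_input_ndarray({"weight": nd.random_normal(shape=(5, 5)), "t": 1, "wd": 0.1})
#   -> {'weight': '<NDArray 5x5 @cpu(0)>', 't': 1, 'wd': 0.1}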

def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler):
    if profiler == 'native':
        if run_backward:
            benchmark_helper_func = cpp_profile(nd_forward_backward_and_profile)
@@ -67,28 +111,20 @@ def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, ar
        raise ValueError("Incorrect input for profiler. Valid input - 'python' or 'native'")

    # Warm up, ignore the profiler output
    _, _ = benchmark_helper_func(op, warmup, **kwargs_list[0])

    # Run Benchmarks
    op_benchmark_result = {op.__name__: []}
    logging.info("Begin Benchmark - {name}".format(name=op.__name__))

    for idx, kwargs in enumerate(kwargs_list):
        _, profiler_output = benchmark_helper_func(op, runs, **kwargs)

        # Add inputs used for profiling this operator into result
        # parse input if it contains ndarray, replace with shape info for better markdown readability
        new_inp = parse_input_ndarray(inputs[idx])
        profiler_output = merge_map_list([{"inputs": new_inp}] + [profiler_output])
        op_benchmark_result[op.__name__].append(profiler_output)
    logging.info("Complete Benchmark - {name}".format(name=op.__name__))
    return op_benchmark_result

@@ -128,15 +164,15 @@ def run_performance_test(ops, inputs, run_backward=True,
    List of dictionary of benchmark results. Key -> name of the operator, Value -> benchmark results.
    """
    kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx)

    if not isinstance(ops, list):
        ops = [ops]

    op_benchmark_result = []
    for op in ops:
        if hasattr(mx.nd, op.__name__):
            benchmark_result = _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list, profiler)
        else:
            raise ValueError("Unknown NDArray operator provided to benchmark. - ", op.__name__)
        op_benchmark_result.append(benchmark_result)
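Net effect of the benchmark_utils changes: every input now travels as keyword arguments, with positional-args handling moved out of this module (per the commit message, into profiler_util). A hedged end-to-end sketch mirroring the amp_multicast inputs used earlier in this commit:

import mxnet as mx
from benchmark.opperf.utils.benchmark_utils import run_performance_test

# Shape tuples in `inputs` are materialized into NDArrays by _prepare_op_inputs;
# plain scalars such as num_outputs pass through unchanged.
res = run_performance_test(mx.nd.amp_multicast,
                           inputs=[{"args": [(1024, 1024)], "num_outputs": 1}],
                           run_backward=True, dtype='float32', ctx=mx.cpu(),
                           profiler='native', warmup=25, runs=100)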
(Diffs for the remaining 2 changed files are not shown.)
