From 44e157ee4cc8c311636c4c031db389acbd26aef9 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 6 Aug 2019 22:39:57 +0530 Subject: [PATCH 01/19] update code to fix #15759 --- src/operator/optimizer_op-inl.h | 51 +++++++++++++++++++++------------ src/operator/optimizer_op.cu | 3 +- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 50637a8e7b42..21e04b0033c5 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1293,15 +1293,38 @@ struct AdamParam : public dmlc::Parameter { } }; +struct AdamUpdateKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data, + const DType clip_gradient, const DType beta1, const DType beta2, const DType lr, + const DType wd, const DType epsilon, const DType rescale_grad, const OpReqType req) { + using namespace mshadow_op; + + const DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; + if (clip_gradient >= 0.0f) { + mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * + clip::Map(grad_rescaled, clip_gradient); + var_data[i] = beta2 * var_data[i] + (1.f - beta2) * square::Map( + clip::Map(grad_rescaled, clip_gradient)); + } else { + mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; + var_data[i] = beta2 * var_data[i] + + (1.f - beta2) * grad_rescaled * grad_rescaled; + } + + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - lr * mean_data[i] / + (square_root::Map(var_data[i]) + epsilon)); + } +}; + template inline void AdamUpdate(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mshadow; - using namespace mshadow::expr; - using namespace mshadow_op; + using namespace mxnet_op; const AdamParam& param = nnvm::get(attrs.parsed); Stream* s = ctx.get_stream(); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { @@ -1311,22 +1334,12 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, Tensor var = inputs[3].FlatTo2D(s); Tensor out = outputs[0].FlatTo2D(s); - grad = scalar(param.rescale_grad) * grad + - scalar(param.wd) * weight; - - if (param.clip_gradient >= 0.0f) { - mean = scalar(param.beta1)*mean + scalar(1.f-param.beta1) * - F(grad, DType(param.clip_gradient)); - var = scalar(param.beta2)*var + scalar(1.f-param.beta2)*F( - F(grad, DType(param.clip_gradient))); - } else { - mean = scalar(param.beta1)*mean + scalar(1.f-param.beta1) * grad; - var = scalar(param.beta2)*var + scalar(1.f-param.beta2) * F(grad); - } - Assign(out, req[0], - weight - - scalar(param.lr) * mean / - (F(var) + scalar(param.epsilon))); + Kernel::Launch(s, weight.shape_.Size(), out.dptr_, mean.dptr_, + var.dptr_, weight.dptr_, grad.dptr_, static_cast(param.clip_gradient), + static_cast(param.beta1), static_cast(param.beta2), + static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad), req[0]); }); } diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 2c72462de016..3fa60273e186 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -261,8 +261,7 @@ NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); NNVM_REGISTER_OP(adam_update) -.set_attr("FCompute", AdamUpdate) -.set_attr("FComputeEx", AdamUpdateEx); +.set_attr("FCompute", AdamUpdate); NNVM_REGISTER_OP(rmsprop_update) .set_attr("FCompute", RMSPropUpdate); From 
5d119826173c4a4530ff4ef8cc072d03426728c3 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 6 Aug 2019 23:05:19 +0530 Subject: [PATCH 02/19] add relevant test --- tests/python/unittest/test_ndarray.py | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 0f154bd67a1a..bf8620460bd9 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -1799,6 +1799,38 @@ def check_save_load(save_is_np_shape, load_is_np_shape, shapes, save_throw_excep check_save_load(True, True, [(2, 0, 1), (0,), (), (), (0, 4), (), (3, 0, 0, 0), (2, 1), (0, 5, 0)], False, False) +@with_seed() +def test_adam_update_mutate(): + def assert_mutate(x, y): return \ + np.testing.assert_raises( + AssertionError, np.testing.assert_allclose, x, y) + + def assert_unchanged(x, y): return \ + np.testing.assert_allclose(x, y) + + for dim in range(1, 7): + shape = rand_shape_nd(dim) + weight = mx.nd.random.normal(shape=shape) + grad = mx.nd.random.normal(shape=shape) + mean = mx.nd.random.normal(shape=shape) + var = mx.nd.random.normal(shape=shape) + + pre_weight, pre_grad, pre_mean, pre_var = map( + lambda x: x.asnumpy(), [weight, grad, mean, var]) + + # Operate + mx.nd.adam_update(weight, grad, mean, var, out=weight, lr=0.01, wd=1e-3) + + post_weight, post_grad, post_mean, post_var = map( + lambda x: x.asnumpy(), [weight, grad, mean, var]) + + # Assertions + assert_mutate(pre_weight, post_weight) + assert_mutate(pre_mean, post_mean) + assert_mutate(pre_var, post_var) + assert_unchanged(pre_grad, post_grad) + + if __name__ == '__main__': import nose nose.runmodule() From dc7e401ba8745b61c59b3985eaa1362be634b253 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 6 Aug 2019 23:53:58 +0530 Subject: [PATCH 03/19] re-add the removed conditional dispatch --- src/operator/optimizer_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 3fa60273e186..2c72462de016 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -261,7 +261,8 @@ NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); NNVM_REGISTER_OP(adam_update) -.set_attr("FCompute", AdamUpdate); +.set_attr("FCompute", AdamUpdate) +.set_attr("FComputeEx", AdamUpdateEx); NNVM_REGISTER_OP(rmsprop_update) .set_attr("FCompute", RMSPropUpdate); From 1be1f01144df69b98bf423ee129a3efbd99796bd Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 7 Aug 2019 21:06:10 +0530 Subject: [PATCH 04/19] fix grad mutate for ftrl_update --- src/operator/optimizer_op-inl.h | 73 +++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 21e04b0033c5..52f15c7a9a3d 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1297,8 +1297,10 @@ struct AdamUpdateKernel { template MSHADOW_XINLINE static void Map(int i, DType* out_data, DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data, - const DType clip_gradient, const DType beta1, const DType beta2, const DType lr, - const DType wd, const DType epsilon, const DType rescale_grad, const OpReqType req) { + const DType clip_gradient, const DType rescale_grad, + const DType beta1, const DType beta2, + const DType lr, const DType wd, + const DType epsilon, const OpReqType req) { using namespace mshadow_op; const DType 
grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; @@ -1334,12 +1336,12 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, Tensor var = inputs[3].FlatTo2D(s); Tensor out = outputs[0].FlatTo2D(s); - Kernel::Launch(s, weight.shape_.Size(), out.dptr_, mean.dptr_, - var.dptr_, weight.dptr_, grad.dptr_, static_cast(param.clip_gradient), + Kernel::Launch(s, weight.shape_.Size(), + out.dptr_, mean.dptr_, var.dptr_, weight.dptr_, grad.dptr_, + static_cast(param.clip_gradient), static_cast(param.rescale_grad), static_cast(param.beta1), static_cast(param.beta2), - static_cast(param.lr), - static_cast(param.wd), static_cast(param.epsilon), - static_cast(param.rescale_grad), req[0]); + static_cast(param.lr), static_cast(param.wd), + static_cast(param.epsilon), req[0]); }); } @@ -1794,15 +1796,44 @@ struct FtrlParam : public dmlc::Parameter { } }; +struct FtrlUpdateKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, + DType* n_data, DType* z_data, const DType* weight_data, const DType* grad_data, + const DType clip_gradient, const DType rescale_grad, + const DType beta, const DType lamda1, + const DType lr, const DType wd, + const OpReqType req) { + using namespace mshadow_op; + + const DType grad_rescaled = grad_data[i] * rescale_grad; + if (clip_gradient >= 0.0f) { + z_data[i] += clip::Map(grad_rescaled, clip_gradient) - + (square_root::Map(n_data[i] + + square::Map(clip::Map(grad_rescaled, clip_gradient))) - + square_root::Map(n_data[i])) * weight_data[i] / lr; + n_data[i] += square::Map(clip::Map(grad_rescaled, clip_gradient)); + } else { + z_data[i] += grad_rescaled - (square_root::Map(n_data[i] + + square::Map(grad_rescaled)) - square_root::Map(n_data[i])) * + weight_data[i] / lr; + n_data[i] += square::Map(grad_rescaled); + } + KERNEL_ASSIGN(out_data[i], req, + (sign::Map(z_data[i]) * lamda1 - z_data[i]) / + ((beta + square_root::Map(n_data[i])) / lr + wd) * + gt::Map(abs::Map(z_data[i]), lamda1)); + } +}; + template inline void FtrlUpdate(const nnvm::NodeAttrs& attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mshadow; - using namespace mshadow::expr; - using namespace mshadow_op; + using namespace mxnet_op; + const FtrlParam& param = nnvm::get(attrs.parsed); Stream* s = ctx.get_stream(); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { @@ -1812,23 +1843,11 @@ inline void FtrlUpdate(const nnvm::NodeAttrs& attrs, Tensor n = inputs[3].FlatTo2D(s); Tensor out = outputs[0].FlatTo2D(s); - grad = scalar(param.rescale_grad) * grad; - - if (param.clip_gradient >= 0.0f) { - z += F(grad, DType(param.clip_gradient)) - (F(n + - F(F(grad, DType(param.clip_gradient)))) - F(n)) * - weight / scalar(param.lr); - n += F(F(grad, DType(param.clip_gradient))); - } else { - z += grad - (F(n + F(grad)) - F(n)) * - weight / scalar(param.lr); - n += F(grad); - } - Assign(out, req[0], - (F(z) * scalar(param.lamda1) - z) / - ((scalar(param.beta) + F(n)) / - scalar(param.lr) + scalar(param.wd)) * - F(F(z), scalar(param.lamda1))); + Kernel::Launch(s, weight.shape_.Size(), + out.dptr_, n.dptr_, z.dptr_, weight.dptr_, grad.dptr_, + static_cast(param.clip_gradient), static_cast(param.rescale_grad), + static_cast(param.beta), static_cast(param.lamda1), + static_cast(param.lr), static_cast(param.wd), req[0]); }); } From 281e3b2e248ba06b69d20034dd47b987f4147e9c Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 7 Aug 2019 22:08:38 +0530 Subject: [PATCH 05/19] add test for ftrl_update --- 
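Note for reviewers: the per-element update that the new FtrlUpdateKernel (PATCH 04 above) computes, written out with \tilde g as the rescaled and optionally clipped gradient, is

    \tilde g = \mathrm{clip}(\mathrm{rescale\_grad}\cdot g,\ \mathrm{clip\_gradient})
    z \leftarrow z + \tilde g - \frac{\sqrt{n + \tilde g^{2}} - \sqrt{n}}{\mathrm{lr}}\, w
    n \leftarrow n + \tilde g^{2}
    w \leftarrow \frac{\mathrm{sign}(z)\,\lambda_1 - z}{(\beta + \sqrt{n})/\mathrm{lr} + \mathrm{wd}} \cdot \mathbb{1}[\,|z| > \lambda_1\,]

where the clip step is skipped when clip_gradient < 0, the n in the last line is the already-updated value, and \lambda_1 is the operator's lamda1 parameter. The gradient array is only read, never written, which is exactly what the test added below asserts.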
tests/python/unittest/test_ndarray.py | 70 ++++++++++++++++----------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index bf8620460bd9..168e8a989c1e 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -28,7 +28,7 @@ from mxnet.test_utils import default_context from mxnet.test_utils import np_reduce from mxnet.test_utils import same -from mxnet.test_utils import random_sample, rand_shape_nd +from mxnet.test_utils import random_sample, rand_shape_nd, random_arrays from mxnet import runtime from numpy.testing import assert_allclose import mxnet.autograd @@ -1800,35 +1800,49 @@ def check_save_load(save_is_np_shape, load_is_np_shape, shapes, save_throw_excep @with_seed() -def test_adam_update_mutate(): - def assert_mutate(x, y): return \ - np.testing.assert_raises( - AssertionError, np.testing.assert_allclose, x, y) - - def assert_unchanged(x, y): return \ - np.testing.assert_allclose(x, y) - - for dim in range(1, 7): - shape = rand_shape_nd(dim) - weight = mx.nd.random.normal(shape=shape) - grad = mx.nd.random.normal(shape=shape) - mean = mx.nd.random.normal(shape=shape) - var = mx.nd.random.normal(shape=shape) - - pre_weight, pre_grad, pre_mean, pre_var = map( - lambda x: x.asnumpy(), [weight, grad, mean, var]) - - # Operate - mx.nd.adam_update(weight, grad, mean, var, out=weight, lr=0.01, wd=1e-3) +def test_update_ops_mutation(): + def assert_mutate(x, y, op): + np.testing.assert_raises( + AssertionError, np.testing.assert_allclose, x, y) + + def assert_unchanged(x, y, op): + np.testing.assert_allclose(x, y) + + def test_op(op, num_inputs, mutated_inputs, **kwargs): + for dim in range(1, 7): + shape = rand_shape_nd(dim) + shapes = (shape,) * num_inputs + + # Generate Arrays + arrays = tuple(map(mx.nd.array, random_arrays(*shapes))) + + # Arrays before update + pre_arrays = tuple(map( + lambda x: x.asnumpy(), arrays)) + + # Operate + # weight -> arrays[0] + op(*arrays, out=arrays[0], **kwargs) + + # Arrays post update + post_arrays = tuple(map( + lambda x: x.asnumpy(), arrays)) + + for idx, (pre_array, post_array) in \ + enumerate(zip(pre_arrays, post_arrays)): + if idx in mutated_inputs: + assert_mutate(pre_array, post_array, op) + else: + assert_unchanged(pre_array, post_array, op) - post_weight, post_grad, post_mean, post_var = map( - lambda x: x.asnumpy(), [weight, grad, mean, var]) + test_op(mx.nd.ftrl_update, 4, [0, 2, 3], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) + test_op(mx.nd.adam_update, 4, [0, 2, 3], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) - # Assertions - assert_mutate(pre_weight, post_weight) - assert_mutate(pre_mean, post_mean) - assert_mutate(pre_var, post_var) - assert_unchanged(pre_grad, post_grad) + # Currently fails. 
+ # test_op(mx.nd.rmsprop_update, 3, [0, 2],**{'rescale_grad':0.1, 'lr':0.01, 'wd':1e-3}) + # test_op(mx.nd.rmspropalex_update, 5, [0, 2, 3, 4], **{'rescale_grad':0.1, 'lr':0.01, 'wd':1e-3}) if __name__ == '__main__': From c135c9d3e0fd0e9a034a741fc830bdd06e631155 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Fri, 9 Aug 2019 09:06:21 +0530 Subject: [PATCH 06/19] fix grad mutate for rmspropalex_update --- src/operator/optimizer_op-inl.h | 101 +++++++++++++++++++------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 52f15c7a9a3d..e61b85712965 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1611,57 +1611,76 @@ struct RMSPropAlexParam : public dmlc::Parameter { } }; +struct RMSPropAlexUpdateKernel { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, + DType* state_n_data, DType* state_g_data, DType* delta_data, + const DType* weight_data, const DType* grad_data, + const DType clip_gradient, const DType rescale_grad, + const DType gamma1, const DType gamma2, + const DType lr, const DType wd, + const DType clip_weights, const DType epsilon, + const OpReqType req) { + using namespace mshadow_op; + + const DType rescaled_grad = rescale_grad * grad_data[i] + + wd * weight_data[i]; + + if (clip_gradient >= 0.0f) { + state_n_data[i] = (1.f - gamma1) * + clip::Map(rescaled_grad, clip_gradient) * + clip::Map(rescaled_grad, clip_gradient) + + gamma1 * state_n_data[i]; + state_g_data[i] = (1.f - gamma1) * + clip::Map(rescaled_grad, clip_gradient) + + gamma1 * state_g_data[i]; + delta_data[i] = gamma2 * delta_data[i] - + lr * (clip::Map(rescaled_grad, clip_gradient) / + (square_root::Map(state_n_data[i] - + state_g_data[i] * state_g_data[i] + epsilon))); + } else { + state_n_data[i] = (1.f - gamma1) * rescaled_grad * rescaled_grad + + gamma1 * state_n_data[i]; + state_g_data[i] = (1.f - gamma1) * rescaled_grad + + gamma1 * state_g_data[i]; + delta_data[i] = gamma2 * delta_data[i] - + (lr * (rescaled_grad) / + (square_root::Map(state_n_data[i] - + state_g_data[i] * state_g_data[i] + epsilon))); + } + + if (clip_weights >= 0.0f) { + const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights); + Assign(out_data[i], req, clipped_weight); + } else { + Assign(out_data[i], req, weight_data[i] + delta_data[i]); + } + } +}; + template inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mshadow; - using namespace mshadow::expr; - using namespace mshadow_op; + using namespace mxnet_op; const RMSPropAlexParam ¶m = nnvm::get(attrs.parsed); Stream *s = ctx.get_stream(); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Tensor weight = inputs[0].FlatTo2D(s); - Tensor grad = inputs[1].FlatTo2D(s); - Tensor state_n = inputs[2].FlatTo2D(s); - Tensor state_g = inputs[3].FlatTo2D(s); - Tensor delta = inputs[4].FlatTo2D(s); - Tensor out = outputs[0].FlatTo2D(s); - - grad = scalar(param.rescale_grad) * grad + - scalar(param.wd) * weight; - - if (param.clip_gradient >= 0.0f) { - state_n = scalar(1.f - param.gamma1) * - F(grad, DType(param.clip_gradient)) * - F(grad, DType(param.clip_gradient)) + - scalar(param.gamma1) * state_n; - state_g = scalar(1.f - param.gamma1) * - F(grad, DType(param.clip_gradient)) + - scalar(param.gamma1) * state_g; - delta = scalar(param.gamma2) * delta - - scalar(param.lr) * - (F(grad, 
DType(param.clip_gradient)) / - (F(state_n - state_g * state_g + - scalar(param.epsilon)))); - } else { - state_n = scalar(1.f - param.gamma1) * (grad * grad) + - scalar(param.gamma1) * state_n; - state_g = scalar(1.f - param.gamma1) * grad + - scalar(param.gamma1) * state_g; - delta = scalar(param.gamma2) * delta - - scalar(param.lr) * - (grad / (F(state_n - state_g * state_g + - scalar(param.epsilon)))); - } + DType* weight_data = inputs[0].dptr(); + DType* grad_data = inputs[1].dptr(); + DType* state_n_data = inputs[2].dptr(); + DType* state_g_data = inputs[3].dptr(); + DType* delta_data = inputs[4].dptr(); + DType* out_data = outputs[0].dptr(); - if (param.clip_weights >= 0.0f) { - Assign(out, req[0], F(weight + delta, DType(param.clip_weights))); - } else { - Assign(out, req[0], weight + delta); - } + Kernel::Launch(s, inputs[0].shape_.Size(), + out_data, state_n_data, state_g_data, delta_data, weight_data, grad_data, + static_cast(param.clip_gradient), static_cast(param.rescale_grad), + static_cast(param.gamma1), static_cast(param.gamma2), + static_cast(param.lr), static_cast(param.wd), + static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); } From f92a5f610de10e9286753a916961d11a57cce3ea Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Fri, 9 Aug 2019 09:06:58 +0530 Subject: [PATCH 07/19] add test for rmspropalex_update --- tests/python/unittest/test_ndarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 168e8a989c1e..23a40a8025f2 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -1839,10 +1839,10 @@ def test_op(op, num_inputs, mutated_inputs, **kwargs): {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) test_op(mx.nd.adam_update, 4, [0, 2, 3], ** {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) - + test_op(mx.nd.rmspropalex_update, 5, [ + 0, 2, 3, 4], **{'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) # Currently fails. # test_op(mx.nd.rmsprop_update, 3, [0, 2],**{'rescale_grad':0.1, 'lr':0.01, 'wd':1e-3}) - # test_op(mx.nd.rmspropalex_update, 5, [0, 2, 3, 4], **{'rescale_grad':0.1, 'lr':0.01, 'wd':1e-3}) if __name__ == '__main__': From 267201b63abafae6ebb4b590f199173f6637fe23 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Fri, 9 Aug 2019 10:17:48 +0530 Subject: [PATCH 08/19] use KERNEL_ASSIGN in RMSPropAlexUpdateKernel. 
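KERNEL_ASSIGN writes the result according to the OpReqType handed to the kernel (kNullOp / kWriteTo / kWriteInplace / kAddTo) instead of always overwriting, which is what the plain Assign calls effectively did. A simplified stand-in for the macro, only to illustrate the dispatch (the real definition lives in src/operator/mxnet_op.h), could look like:

    #include <mxnet/op_attr_types.h>  // OpReqType and kNullOp/kWriteTo/kWriteInplace/kAddTo

    // Hypothetical helper mirroring what KERNEL_ASSIGN(out, req, exp) does:
    // honor the request type rather than unconditionally overwriting the output.
    template <typename DType>
    inline void assign_by_req(DType* out, mxnet::OpReqType req, DType val) {
      switch (req) {
        case mxnet::kNullOp:                   // no output requested
          break;
        case mxnet::kWriteTo:
        case mxnet::kWriteInplace:
          *out = val;                          // overwrite
          break;
        case mxnet::kAddTo:
          *out += val;                         // accumulate into the output
          break;
        default:
          break;
      }
    }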
--- src/operator/optimizer_op-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index e61b85712965..419eb4a65811 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1651,9 +1651,9 @@ struct RMSPropAlexUpdateKernel { if (clip_weights >= 0.0f) { const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights); - Assign(out_data[i], req, clipped_weight); + KERNEL_ASSIGN(out_data[i], req, clipped_weight); } else { - Assign(out_data[i], req, weight_data[i] + delta_data[i]); + KERNEL_ASSIGN(out_data[i], req, weight_data[i] + delta_data[i]); } } }; From a80564b554becd24e5025483d6481e1a459a5e05 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Sat, 10 Aug 2019 01:06:33 +0530 Subject: [PATCH 09/19] fix grad mutate for rmsprop_update --- src/operator/optimizer_op-inl.h | 101 ++++++++++++++++---------------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 419eb4a65811..4a23a40bb78e 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1722,64 +1722,67 @@ struct RMSPropParam : public dmlc::Parameter { } }; +struct RMSPropUpdateKernel { + template + MSHADOW_XINLINE static void Map(int i, + DType* out_data, DType* state_n_data, + const DType* weight_data, const DType* grad_data, + const DType clip_gradient, const DType rescale_grad, + const DType gamma1, const DType lr, const DType wd, + const DType clip_weights, const DType epsilon, + const OpReqType req) { + using namespace mshadow_op; + + const DType rescaled_grad = rescale_grad * grad_data[i] + wd * weight_data[i]; + + if (clip_gradient >= 0.0f) { + const DType clipped_grad = clip::Map(rescaled_grad, clip_gradient); + state_n_data[i] = (1.f - gamma1) * (clipped_grad * clipped_grad) + gamma1 * state_n_data[i]; + if (clip_weights >= 0.0f) { + KERNEL_ASSIGN(out_data[i], req, + clip::Map(weight_data[i] - + lr * clipped_grad / (square_root::Map(state_n_data[i] + epsilon)), + clip_weights)); + } else { + KERNEL_ASSIGN(out_data[i], req, + weight_data[i] - + lr * clipped_grad / (square_root::Map(state_n_data[i] + epsilon))); + } + } else { + state_n_data[i] = (1.f - gamma1) * (rescaled_grad * rescaled_grad) + gamma1 * state_n_data[i]; + if (clip_weights >= 0.0f) { + KERNEL_ASSIGN(out_data[i], req, + clip::Map(weight_data[i] - + lr * (rescaled_grad / square_root::Map(state_n_data[i] + epsilon)), + clip_weights)); + } else { + KERNEL_ASSIGN(out_data[i], req, + weight_data[i] - + lr * (rescaled_grad / (square_root::Map(state_n_data[i] + epsilon)))); + } + } + } +}; + template inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mshadow; - using namespace mshadow::expr; - using namespace mshadow_op; + using namespace mxnet_op; const RMSPropParam ¶m = nnvm::get(attrs.parsed); Stream *s = ctx.get_stream(); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - Tensor weight = inputs[0].FlatTo2D(s); - Tensor grad = inputs[1].FlatTo2D(s); - Tensor state_n = inputs[2].FlatTo2D(s); - Tensor out = outputs[0].FlatTo2D(s); + DType* weight_data = inputs[0].dptr(); + DType* grad_data = inputs[1].dptr(); + DType* state_n_data = inputs[2].dptr(); + DType* out_data = outputs[0].dptr(); - grad = scalar(param.rescale_grad) * grad + - scalar(param.wd) * weight; - - if (param.clip_gradient >= 
0.0f) { - state_n = scalar(1.f - param.gamma1) * - F(grad, DType(param.clip_gradient)) * - F(grad, DType(param.clip_gradient)) + - scalar(param.gamma1) * state_n; - if (param.clip_weights >= 0.0f) { - Assign(out, req[0], - F(weight - - scalar(param.lr) * - (F(grad, DType(param.clip_gradient)) / - (F(state_n + - scalar(param.epsilon)))), - DType(param.clip_weights))); - } else { - Assign(out, req[0], weight - - scalar(param.lr) * - (F(grad, DType(param.clip_gradient)) / - (F(state_n + - scalar(param.epsilon))))); - } - } else { - state_n = scalar(1.f - param.gamma1) * (grad * grad) + - scalar(param.gamma1) * state_n; - if (param.clip_weights >= 0.0f) { - Assign(out, req[0], - F(weight - - scalar(param.lr) * - (grad / - (F(state_n + - scalar(param.epsilon)))), - DType(param.clip_weights))); - } else { - Assign(out, req[0], weight - - scalar(param.lr) * - (grad / - (F(state_n + - scalar(param.epsilon))))); - } - } + Kernel::Launch(s, inputs[0].shape_.Size(), + out_data, state_n_data, weight_data, grad_data, + static_cast(param.clip_gradient), static_cast(param.rescale_grad), + static_cast(param.gamma1), static_cast(param.lr), static_cast(param.wd), + static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); } From 9de7d215405728c935206874acc72f4483ae91dc Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Sat, 10 Aug 2019 01:07:58 +0530 Subject: [PATCH 10/19] add test for rmsprop_update --- tests/python/unittest/test_ndarray.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 23a40a8025f2..465fedb79e05 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -1839,10 +1839,10 @@ def test_op(op, num_inputs, mutated_inputs, **kwargs): {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) test_op(mx.nd.adam_update, 4, [0, 2, 3], ** {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) - test_op(mx.nd.rmspropalex_update, 5, [ - 0, 2, 3, 4], **{'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) - # Currently fails. 
- # test_op(mx.nd.rmsprop_update, 3, [0, 2],**{'rescale_grad':0.1, 'lr':0.01, 'wd':1e-3}) + test_op(mx.nd.rmspropalex_update, 5, [0, 2, 3, 4], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) + test_op(mx.nd.rmsprop_update, 3, [0, 2], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) if __name__ == '__main__': From 05844ac40747924930fa436926d23b527008efad Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Sat, 10 Aug 2019 12:07:20 +0530 Subject: [PATCH 11/19] add more optimizers for mutation test --- tests/python/unittest/test_ndarray.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 465fedb79e05..8aa43b4a553f 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -1835,6 +1835,25 @@ def test_op(op, num_inputs, mutated_inputs, **kwargs): else: assert_unchanged(pre_array, post_array, op) + test_op(mx.nd.signsgd_update, 2, [0], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3, + 'clip_gradient': 1e-3}) + test_op(mx.nd.signum_update, 3, [0, 2], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3, + 'momentum': 1e-3, 'clip_gradient': 1e-3, + 'wd_lh': 1e-3}) + test_op(mx.nd.sgd_update, 2, [0], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3, + 'clip_gradient': 1e-3}) + test_op(mx.nd.sgd_mom_update, 3, [0, 2], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3, + 'momentum': 0.01, 'clip_gradient': 1e-3}) + test_op(mx.nd.nag_mom_update, 3, [0, 2], ** + {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3, + 'momentum': 0.01, 'clip_gradient': 1e-3}) + test_op(mx.nd.ftml_update, 5, [0, 2, 3, 4], ** + {'t': 3, 'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3, + 'clip_grad': 1e-3}) test_op(mx.nd.ftrl_update, 4, [0, 2, 3], ** {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3}) test_op(mx.nd.adam_update, 4, [0, 2, 3], ** From 6ef7408ce008177ba03d82c40f1705bec63bf519 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 20 Aug 2019 23:10:56 +0530 Subject: [PATCH 12/19] retrigger CI From 30dffdb103d10de362a6cd53b0b0a10af625ca51 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 21 Aug 2019 22:20:19 +0530 Subject: [PATCH 13/19] retrigger CI From 91227bfffca0f27f0118917e4e0458546d85516b Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 27 Aug 2019 08:11:00 +0530 Subject: [PATCH 14/19] retrigger CI From 932e7c44971212dc832396281f61fc91ef166e9f Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 28 Aug 2019 21:56:08 +0530 Subject: [PATCH 15/19] retrigger CI From 504839f1f6df7fb69d8931e1df3a1d9793aeadab Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Fri, 30 Aug 2019 22:51:52 +0530 Subject: [PATCH 16/19] address comments. * refactor code. 
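The refactor collapses the duplicated clipped / unclipped branches in AdamUpdateKernel, RMSPropAlexUpdateKernel, RMSPropUpdateKernel and FtrlUpdateKernel into a single path that rescales and clips the gradient once up front. A rough standalone sketch of the shared pattern (a hypothetical helper, not code added by this patch; FTRL applies the same idea without the weight-decay term):

    #include <algorithm>

    // Rescale the gradient, add weight decay, and clamp to [-clip, clip] once,
    // so each optimizer kernel keeps one update path instead of two.
    template <typename DType>
    inline DType rescale_and_clip(DType grad, DType weight, DType rescale_grad,
                                  DType wd, DType clip_gradient) {
      DType g = grad * rescale_grad + weight * wd;
      if (clip_gradient >= DType(0)) {
        // same bounds as mshadow_op::clip::Map(g, clip_gradient)
        g = std::min(std::max(g, -clip_gradient), clip_gradient);
      }
      return g;
    }

The gradient array itself is still never written; only the local copy is clipped.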
--- src/operator/optimizer_op-inl.h | 119 ++++++++++++-------------------- 1 file changed, 43 insertions(+), 76 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 4a23a40bb78e..3c03f8061d87 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -1303,18 +1303,15 @@ struct AdamUpdateKernel { const DType epsilon, const OpReqType req) { using namespace mshadow_op; - const DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; - if (clip_gradient >= 0.0f) { - mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[i] = beta2 * var_data[i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; - var_data[i] = beta2 * var_data[i] + - (1.f - beta2) * grad_rescaled * grad_rescaled; + DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; + if (clip_gradient >= 0.f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; + var_data[i] = beta2 * var_data[i] + + (1.f - beta2) * grad_rescaled * grad_rescaled; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - lr * mean_data[i] / (square_root::Map(var_data[i]) + epsilon)); } @@ -1623,32 +1620,20 @@ struct RMSPropAlexUpdateKernel { const OpReqType req) { using namespace mshadow_op; - const DType rescaled_grad = rescale_grad * grad_data[i] + - wd * weight_data[i]; - + DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; if (clip_gradient >= 0.0f) { - state_n_data[i] = (1.f - gamma1) * - clip::Map(rescaled_grad, clip_gradient) * - clip::Map(rescaled_grad, clip_gradient) + - gamma1 * state_n_data[i]; - state_g_data[i] = (1.f - gamma1) * - clip::Map(rescaled_grad, clip_gradient) + - gamma1 * state_g_data[i]; - delta_data[i] = gamma2 * delta_data[i] - - lr * (clip::Map(rescaled_grad, clip_gradient) / - (square_root::Map(state_n_data[i] - - state_g_data[i] * state_g_data[i] + epsilon))); - } else { - state_n_data[i] = (1.f - gamma1) * rescaled_grad * rescaled_grad + - gamma1 * state_n_data[i]; - state_g_data[i] = (1.f - gamma1) * rescaled_grad + - gamma1 * state_g_data[i]; - delta_data[i] = gamma2 * delta_data[i] - - (lr * (rescaled_grad) / - (square_root::Map(state_n_data[i] - - state_g_data[i] * state_g_data[i] + epsilon))); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + state_n_data[i] = (1.f - gamma1) * grad_rescaled * grad_rescaled + + gamma1 * state_n_data[i]; + state_g_data[i] = (1.f - gamma1) * grad_rescaled + + gamma1 * state_g_data[i]; + delta_data[i] = gamma2 * delta_data[i] - + (lr * (grad_rescaled) / + (square_root::Map(state_n_data[i] - + state_g_data[i] * state_g_data[i] + epsilon))); + if (clip_weights >= 0.0f) { const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights); KERNEL_ASSIGN(out_data[i], req, clipped_weight); @@ -1733,34 +1718,19 @@ struct RMSPropUpdateKernel { const OpReqType req) { using namespace mshadow_op; - const DType rescaled_grad = rescale_grad * grad_data[i] + wd * weight_data[i]; - + DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; if (clip_gradient >= 0.0f) { - const DType clipped_grad = clip::Map(rescaled_grad, clip_gradient); - state_n_data[i] = (1.f - gamma1) * (clipped_grad * clipped_grad) + gamma1 * state_n_data[i]; - if (clip_weights >= 0.0f) { - KERNEL_ASSIGN(out_data[i], req, - 
clip::Map(weight_data[i] - - lr * clipped_grad / (square_root::Map(state_n_data[i] + epsilon)), - clip_weights)); - } else { - KERNEL_ASSIGN(out_data[i], req, - weight_data[i] - - lr * clipped_grad / (square_root::Map(state_n_data[i] + epsilon))); - } - } else { - state_n_data[i] = (1.f - gamma1) * (rescaled_grad * rescaled_grad) + gamma1 * state_n_data[i]; - if (clip_weights >= 0.0f) { - KERNEL_ASSIGN(out_data[i], req, - clip::Map(weight_data[i] - - lr * (rescaled_grad / square_root::Map(state_n_data[i] + epsilon)), - clip_weights)); - } else { - KERNEL_ASSIGN(out_data[i], req, - weight_data[i] - - lr * (rescaled_grad / (square_root::Map(state_n_data[i] + epsilon)))); - } + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + + state_n_data[i] = (1.f - gamma1) * (grad_rescaled * grad_rescaled) + gamma1 * state_n_data[i]; + + DType weight = weight_data[i] - + lr * (grad_rescaled / square_root::Map(state_n_data[i] + epsilon)); + if (clip_weights >= 0.0f) { + weight = clip::Map(weight, clip_weights); + } + KERNEL_ASSIGN(out_data[i], req, weight); } }; @@ -1828,23 +1798,20 @@ struct FtrlUpdateKernel { const OpReqType req) { using namespace mshadow_op; - const DType grad_rescaled = grad_data[i] * rescale_grad; - if (clip_gradient >= 0.0f) { - z_data[i] += clip::Map(grad_rescaled, clip_gradient) - - (square_root::Map(n_data[i] + - square::Map(clip::Map(grad_rescaled, clip_gradient))) - - square_root::Map(n_data[i])) * weight_data[i] / lr; - n_data[i] += square::Map(clip::Map(grad_rescaled, clip_gradient)); - } else { - z_data[i] += grad_rescaled - (square_root::Map(n_data[i] + - square::Map(grad_rescaled)) - square_root::Map(n_data[i])) * - weight_data[i] / lr; - n_data[i] += square::Map(grad_rescaled); - } - KERNEL_ASSIGN(out_data[i], req, - (sign::Map(z_data[i]) * lamda1 - z_data[i]) / - ((beta + square_root::Map(n_data[i])) / lr + wd) * - gt::Map(abs::Map(z_data[i]), lamda1)); + DType grad_rescaled = grad_data[i] * rescale_grad; + if (clip_gradient >= 0.0f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); + } + + z_data[i] += grad_rescaled - (square_root::Map(n_data[i] + + square::Map(grad_rescaled)) - square_root::Map(n_data[i])) * + weight_data[i] / lr; + n_data[i] += square::Map(grad_rescaled); + + KERNEL_ASSIGN(out_data[i], req, + (sign::Map(z_data[i]) * lamda1 - z_data[i]) / + ((beta + square_root::Map(n_data[i])) / lr + wd) * + gt::Map(abs::Map(z_data[i]), lamda1)); } }; From 8ee84cc401030c8df77a868e97cd2c00ed0fa086 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Sat, 31 Aug 2019 10:18:52 +0530 Subject: [PATCH 17/19] retrigger CI From 08ff57f2d4a599f4db52d7a62ebc1a46110dc778 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 4 Sep 2019 22:12:59 +0530 Subject: [PATCH 18/19] retrigger CI From 7fc9d0cf25c2b2a00bfc85ff7cd7607ba26699da Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Thu, 5 Sep 2019 19:33:39 +0530 Subject: [PATCH 19/19] retrigger CI
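For reference, the per-element Adam step that AdamUpdateKernel (introduced in PATCH 01 and simplified again in PATCH 16) computes is, with \tilde g = \mathrm{clip}(\mathrm{rescale\_grad}\cdot g + \mathrm{wd}\cdot w,\ \mathrm{clip\_gradient}) and the clip skipped when clip_gradient < 0:

    m \leftarrow \beta_1 m + (1-\beta_1)\,\tilde g
    v \leftarrow \beta_2 v + (1-\beta_2)\,\tilde g^{2}
    w \leftarrow w - \mathrm{lr}\,\frac{m}{\sqrt{v} + \epsilon}

m, v and w are updated in place (hence the mutation assertions), while the gradient stays read-only, which is what the new test_update_ops_mutation test verifies for each of the updated operators.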