This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Adadelta optimizer test #13443

Merged: 2 commits, merged on Dec 4, 2018
2 changes: 2 additions & 0 deletions python/mxnet/optimizer/optimizer.py
@@ -637,6 +637,8 @@ class FTML(Optimizer):
        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
        weight = - z / d_t

    For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`.

    This optimizer accepts the following parameters in addition to those accepted
    by :class:`.Optimizer`.
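
As a side note (not part of this diff), the two FTML update lines quoted above can be exercised in isolation. A minimal NumPy sketch, where beta1, rescaled_grad, d_t and d_prev (standing in for d_(t-1)) are assumed to have been computed upstream by the optimizer and the values are purely illustrative:

import numpy as np

beta1 = 0.6                              # illustrative value
rescaled_grad = np.array([0.1, -0.2])    # gradient after rescaling/clipping
weight = np.array([1.0, 2.0])
z = np.zeros_like(weight)                # FTML z state
d_prev = np.ones_like(weight)            # d_(t-1), assumed given
d_t = np.full_like(weight, 1.5)          # d_t, assumed given

# z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_prev) * weight
# weight = - z / d_t
weight = -z / d_t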

73 changes: 72 additions & 1 deletion tests/python/unittest/test_optimizer.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

import itertools
import numpy as np
import mxnet as mx
import mxnet.lr_scheduler as lr_scheduler
@@ -976,8 +977,8 @@ def update(self, index, weight, grad, state):
        div = grad / mx.nd.sqrt(history + self.float_stable_eps)
        weight[:] += (div + weight * wd) * -lr

@with_seed()
def test_adagrad():
    mx.random.seed(0)
    opt1 = PyAdaGrad
    opt2 = mx.optimizer.AdaGrad
    shape = (3, 4, 5)
@@ -1002,6 +1003,76 @@ def test_adagrad():
compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype,
g_stype='row_sparse')

# AdaDelta
class PyAdaDelta(mx.optimizer.Optimizer):
    """The python reference of AdaDelta optimizer.

    This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
    learning rate method*, available at https://arxiv.org/abs/1212.5701.

    This optimizer updates each weight by::

        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
        acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
        cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
        acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
        weight -= (cur_delta + wd * weight)

    This optimizer accepts the following parameters in addition to those accepted
    by :class:`.Optimizer`.

    Parameters
    ----------
    rho: float
        Decay rate for both squared gradients and delta.
    epsilon : float
        Small value to avoid division by 0.
    """
    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
        super(PyAdaDelta, self).__init__(**kwargs)
        self.rho = rho
        self.epsilon = epsilon

    def create_state(self, index, weight):
        return (mx.nd.zeros(weight.shape, weight.context),
                mx.nd.zeros(weight.shape, weight.context))

    def update(self, index, weight, grad, state):
        self._update_count(index)
        wd = self._get_wd(index)

        grad *= self.rescale_grad
        if self.clip_gradient is not None:
            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)

        acc_grad, acc_delta = state

        acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2
        current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) /
                         mx.nd.sqrt(acc_grad + self.epsilon)) * grad
        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta ** 2

        # update weight
        weight[:] -= current_delta + wd * weight
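
Not part of the PR, but as a quick sanity check of the docstring equations above, a self-contained NumPy sketch of a single AdaDelta step (assuming wd = 0 and no gradient clipping; all values below are made up for illustration):

import numpy as np

rho, epsilon = 0.90, 1e-5
weight = np.array([0.5, -0.3, 1.2])
grad = np.array([0.1, 0.02, -0.4])
acc_grad = np.zeros_like(weight)     # running average of squared gradients
acc_delta = np.zeros_like(weight)    # running average of squared updates

# One step, following the docstring: accumulate grad**2, scale the step by the
# ratio of the two RMS terms, then accumulate the squared step itself.
acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
cur_delta = np.sqrt(acc_delta + epsilon) / np.sqrt(acc_grad + epsilon) * grad
acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
weight -= cur_delta

The same arithmetic, expressed with mx.nd in PyAdaDelta above, is what compare_optimizer checks against mx.optimizer.AdaDelta in the test below.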

@with_seed()
def test_adadelta():
    opt1 = PyAdaDelta
    opt2 = mx.optimizer.AdaDelta
    shape = (3, 4, 5)
    rho_options = [{'rho': 0.9}]
    eps_options = [{}, {'epsilon': 1e-8}]
    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
    wd_options = [{}, {'wd': 0.0}]
    for dtype in [np.float16, np.float32]:
        for params in itertools.product(rho_options, eps_options, cg_options,
                                        rg_options, wd_options):
            kwarg = {k: v for param in params for k, v in param.items()}
            if dtype is np.float16:
                kwarg.update({'multi_precision': True})
            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)

Review comment on the itertools.product line (Contributor): This is very clean and nice! Thanks @larroy and @anirudhacharya
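
For readers unfamiliar with the pattern, the dict comprehension in the test flattens one tuple of option dicts produced by itertools.product into a single kwargs dict. A standalone illustration (using a smaller option grid than the test does):

import itertools

rho_options = [{'rho': 0.9}]
eps_options = [{}, {'epsilon': 1e-8}]
cg_options = [{}, {'clip_gradient': 0.4}]

for params in itertools.product(rho_options, eps_options, cg_options):
    kwarg = {k: v for param in params for k, v in param.items()}
    print(kwarg)
# {'rho': 0.9}
# {'rho': 0.9, 'clip_gradient': 0.4}
# {'rho': 0.9, 'epsilon': 1e-08}
# {'rho': 0.9, 'epsilon': 1e-08, 'clip_gradient': 0.4}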


def test_factor_scheduler():
    base_lr = 1