Fix update_on_kvstore option for distributed training
Fix rescale_grad for update_on_kvstore
Ubuntu committed Sep 17, 2018
1 parent b3be92f commit 856b503
Showing 1 changed file with 2 additions and 3 deletions.
python/mxnet/gluon/trainer.py: 2 additions & 3 deletions
@@ -201,7 +201,7 @@ def _init_kvstore(self):
             if self._distributed:
                 # kv.pull(row_sparse_grad) is not supported for dist kvstore
                 update_on_kvstore = self._contains_sparse_weight or self._contains_sparse_grad \
-                                    or 'async' in kvstore.type
+                                    or 'async' in kvstore.type or config['update_on_kvstore']
             if update_on_kvstore:
                 # optimizer preferably needs to be set before init for multiprecision
                 kvstore.set_optimizer(self._optimizer)
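
For context, a minimal usage sketch (not part of this commit) of how the flag checked above reaches config['update_on_kvstore']: the Trainer constructor stores its kvstore arguments, so an explicit update_on_kvstore=True is now honored for a distributed kvstore as well. The network, hyperparameters, and kvstore type below are placeholder assumptions; 'dist_sync' only works when the job is launched with MXNet's parameter-server tooling.

    from mxnet import gluon

    net = gluon.nn.Dense(10)
    net.initialize()

    # Assumed distributed setup: 'dist_sync' requires the process to be started
    # by an MXNet parameter-server launcher; a local run would use the default kvstore.
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.1},
                            kvstore='dist_sync',
                            update_on_kvstore=True)  # forwarded to the check patched above
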
@@ -269,13 +269,12 @@ def step(self, batch_size, ignore_stale_grad=False):
             If true, ignores Parameters with stale gradient (gradient that has not
             been updated by `backward` after last step) and skip update.
         """
+        self._optimizer.rescale_grad = self._scale / batch_size
         if not self._kv_initialized:
             self._init_kvstore()
         if self._params_to_init:
             self._init_params()
 
-        self._optimizer.rescale_grad = self._scale / batch_size
-
         self._allreduce_grads()
         self._update(ignore_stale_grad)
 
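Why the move matters: on the first step() call, _init_kvstore() may hand the optimizer to a distributed kvstore via kvstore.set_optimizer(), after which updates run on the parameter servers; assigning rescale_grad only afterwards would leave that server-side copy without the 1/batch_size scaling. Below is a minimal sketch of the resulting call order using the standard Gluon API on a single machine; the model, data, and hyperparameters are placeholders, not part of the commit.

    from mxnet import autograd, gluon, nd

    net = gluon.nn.Dense(2)
    net.initialize()
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
    loss_fn = gluon.loss.L2Loss()

    data = nd.random.uniform(shape=(4, 8))
    label = nd.zeros((4, 2))
    with autograd.record():
        loss = loss_fn(net(data), label)
    loss.backward()
    # step() now sets rescale_grad = scale / batch_size *before* _init_kvstore()
    # can serialize the optimizer for a distributed kvstore.
    trainer.step(4)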
