apache · szha · Nov 7, 2018 · Oct 10, 2018 · Oct 22, 2018 · Oct 24, 2018
@@ -194,14 +194,18 @@ def _init_kvstore(self):
 
             if config['update_on_kvstore'] is not None:
                 update_on_kvstore = config['update_on_kvstore']
+
         if kvstore:
             if self._compression_params:
                 kvstore.set_gradient_compression(self._compression_params)
             self._distributed = 'dist' in kvstore.type
             if self._distributed:
                 # kv.pull(row_sparse_grad) is not supported for dist kvstore
+                # Captures condition for dist_async, dist_device_sync or based on config for
+                # update_on_kvstore
                 update_on_kvstore = self._contains_sparse_weight or self._contains_sparse_grad \
-                                    or 'async' in kvstore.type
+                                    or 'device' in kvstore.type or 'async' in kvstore.type \
+                                    or config['update_on_kvstore']
             if update_on_kvstore:
                 # optimizer preferably needs to be set before init for multiprecision
                 kvstore.set_optimizer(self._optimizer)
@@ -269,13 +273,20 @@ def step(self, batch_size, ignore_stale_grad=False):
             If true, ignores Parameters with stale gradient (gradient that has not
             been updated by `backward` after last step) and skip update.
         """
+        rescale_grad = self._scale / batch_size
+        if self._update_on_kvstore and self._distributed and \
+           self._optimizer.rescale_grad != rescale_grad:
+            raise UserWarning('Possible change in the `batch_size` from previous `step` detected.' \
+                            'Optimizer gradient normalizing factor will not change w.r.t new batch_size when ' \
+                            'update_on_kvstore=True and when distributed `kvstore` is used.')
+
+        self._optimizer.rescale_grad = rescale_grad
+
         if not self._kv_initialized:
             self._init_kvstore()
         if self._params_to_init:
             self._init_params()
 
-        self._optimizer.rescale_grad = self._scale / batch_size
-
         self._allreduce_grads()
         self._update(ignore_stale_grad)
 

diff --git a/tests/nightly/dist_device_sync_kvstore.py b/tests/nightly/dist_device_sync_kvstore.py
@@ -90,6 +90,25 @@ def check_init(kv, cur_keys, cur_shape, device=False):
     my_rank = kv.rank
     print('worker ' + str(my_rank) + ' is initialized')
 
+def test_gluon_trainer_type():
+    def check_trainer_kv_update(update_on_kv):
+        params = mx.gluon.ParameterDict()
+        x = params.get('x', shape=(10,1), lr_mult=1.0)
+        params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
+        try:
+            trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv, update_on_kvstore=update_on_kv)
+            trainer._init_kvstore()
+            assert trainer._kv_initialized
+            assert trainer._update_on_kvstore is True
+        except ValueError:
+            assert update_on_kv is False
+
+    check_trainer_kv_update(False)
+    check_trainer_kv_update(True)
+    check_trainer_kv_update(None)
+    my_rank = kv.rank
+    print('worker ' + str(my_rank) + ' passed test_gluon_trainer_type')
+
 if __name__ == "__main__":
     test_sync_init()
     test_sync_push_pull()