From b7ee7d85f8f1187e4e84572c1b0fb0fca404756c Mon Sep 17 00:00:00 2001
From: Zheng
Date: Sat, 18 Jan 2020 16:55:37 -0800
Subject: [PATCH] fix: default optimizer aggregate_num to 1

---
 python/mxnet/gluon/trainer.py                   | 13 +++++++------
 python/mxnet/optimizer/lars.py                  |  5 ++---
 python/mxnet/optimizer/optimizer.py             |  5 +++--
 python/mxnet/optimizer/sgd.py                   |  5 ++---
 tests/python/unittest/test_contrib_optimizer.py |  4 +++-
 tests/python/unittest/test_optimizer.py         |  6 ++++--
 6 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index 601a64ec9a58..85e7409cde92 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -60,9 +60,11 @@ class Trainer(object):
         Arguments would then be {'type':'2bit', 'threshold':0.5}
         See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
     update_on_kvstore : bool, default None
-        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
-        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
-        provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
+        Whether to perform parameter updates on kvstore. If None and optimizer.aggregate_num <= 1,
+        then the trainer will choose the more suitable option depending on the type of kvstore.
+        If None and optimizer.aggregate_num > 1, `update_on_kvstore` is set to False.
+        If the `update_on_kvstore` argument is provided,
+        environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
 
     Properties
     ----------
@@ -107,9 +109,8 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
             if update_on_kvstore:
                 raise ValueError("Cannot set update_on_kvstore=True "
                                  "when optimizer.aggregate_num > 1.")
-        if update_on_kvstore is None:
-            if self._optimizer.aggregate_num > 1:
-                update_on_kvstore = False
+        if update_on_kvstore is None and self._optimizer.aggregate_num > 1:
+            update_on_kvstore = False
         self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore}
         self._kv_initialized = False
         self._kvstore = None
diff --git a/python/mxnet/optimizer/lars.py b/python/mxnet/optimizer/lars.py
index af88eee4bef6..1cd746c6dd32 100644
--- a/python/mxnet/optimizer/lars.py
+++ b/python/mxnet/optimizer/lars.py
@@ -64,10 +64,9 @@ class LARS(Optimizer):
     lazy_update : bool, default False
         Default is False. If True, lazy updates are applied \
         if the storage types of weight and grad are both ``row_sparse``.
-    aggregate_num : int, default 4
+    aggregate_num : int, default 1
         Number of weights to be aggregated in a list.
         They are passed to the optimizer for a single optimization step.
-        In default, all the weights are aggregated.
     use_fused_step : bool, default True
         Whether or not to use fused kernels for optimizer.
         When use_fused_step=False, step is called,
@@ -75,7 +74,7 @@ class LARS(Optimizer):
     """
     def __init__(self, learning_rate=0.1, momentum=0.0, eta=0.001, epsilon=1e-8,
                  lazy_update=False, use_fused_step=True,
-                 aggregate_num=4, **kwargs):
+                 aggregate_num=1, **kwargs):
         super(LARS, self).__init__(learning_rate=learning_rate,
                                    use_fused_step=use_fused_step,
                                    aggregate_num=aggregate_num,
diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index 1011a1ae692f..b5e8c2468304 100755
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -77,7 +77,8 @@ class Optimizer(object):
     aggregate_num : int, optional, default None
         Number of weights to be aggregated in a list.
         They are passed to the optimizer for a single optimization step.
-        In default, all the weights are aggregated.
+        By default, only one weight is aggregated.
+        When `aggregate_num` is set to numpy.inf, all the weights are aggregated.
 
     use_fused_step : bool, optional, default None
         Whether or not to use fused kernels for optimizer.
@@ -118,7 +119,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0.,
         self.multi_precision = multi_precision
 
         if aggregate_num is None:
-            self.aggregate_num = numpy.inf
+            self.aggregate_num = 1
         else:
             self.aggregate_num = aggregate_num
 
diff --git a/python/mxnet/optimizer/sgd.py b/python/mxnet/optimizer/sgd.py
index 8bfe3c63bb9c..3e0f74928182 100644
--- a/python/mxnet/optimizer/sgd.py
+++ b/python/mxnet/optimizer/sgd.py
@@ -87,17 +87,16 @@ class SGD(Optimizer):
         True: makes internal 32-bit copy of the weights and applies gradients
         in 32-bit precision even if actual weights used in the model have lower precision.
         Turning this on can improve convergence and accuracy when training with float16.
-    aggregate_num : int, default 4
+    aggregate_num : int, default 1
         Number of weights to be aggregated in a list.
         They are passed to the optimizer for a single optimization step.
-        In default, all the weights are aggregated.
     use_fused_step : bool, default True
         Whether or not to use fused kernels for optimizer.
         When use_fused_step=False, step is called,
         otherwise, fused_step is called.
     """
     def __init__(self, learning_rate=0.1, momentum=0.0, lazy_update=False,
-                 multi_precision=False, use_fused_step=True, aggregate_num=4, **kwargs):
+                 multi_precision=False, use_fused_step=True, aggregate_num=1, **kwargs):
         super(SGD, self).__init__(learning_rate=learning_rate,
                                   multi_precision=multi_precision,
                                   aggregate_num=aggregate_num,
diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py
index 7a6cb6afea1c..5f7c51f257b3 100644
--- a/tests/python/unittest/test_contrib_optimizer.py
+++ b/tests/python/unittest/test_contrib_optimizer.py
@@ -33,8 +33,10 @@ def test_group_adagrad():
     eps_options = [{}, {'epsilon': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+                   {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float32]:
-        for options in itertools.product(eps_options, cg_options, rg_options):
+        for options in itertools.product(eps_options, cg_options, rg_options, agg_options):
             kwarg = dict(wd=0.0)
             for option in options:
                 kwarg.update(option)
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 800a1fb6c926..6137fd9d65df 100755
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -686,7 +686,8 @@ def test_ftrl():
                     ('multi_precision' not in kwarg or
                      not kwarg['multi_precision'])):
                 continue
             compare_optimizer(opt1(use_fused_step=False, **kwarg),
-                              opt2(use_fused_step=True, **kwarg), shapes, dtype)
+                              opt2(use_fused_step=True, **kwarg), shapes, dtype,
+                              rtol=1e-4, atol=1e-4)
 
 @with_seed()
@@ -710,7 +711,8 @@ def test_sparse_ftrl():
                     ('multi_precision' not in kwarg or
                      not kwarg['multi_precision'])):
                 continue
             compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes,
-                              dtype, w_stype='row_sparse', g_stype='row_sparse')
+                              dtype, w_stype='row_sparse', g_stype='row_sparse',
+                              rtol=1e-4, atol=1e-4)
 
 @with_seed()
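
For reference, a minimal sketch of how the new defaults behave on the Gluon side. This is not part of the patch itself: it assumes an MXNet build with the changes above applied, and the `Dense(10)` block and the choice of `aggregate_num=4` are only illustrative.

    from mxnet import gluon, optimizer

    # With the changes above, an optimizer aggregates only one weight per
    # optimization step unless aggregate_num is raised explicitly.
    sgd = optimizer.SGD(learning_rate=0.1)
    assert sgd.aggregate_num == 1

    # Multi-weight aggregation is opt-in. When aggregate_num > 1 and
    # update_on_kvstore is left as None, the Trainer falls back to
    # update_on_kvstore=False; passing update_on_kvstore=True in that
    # situation raises a ValueError.
    net = gluon.nn.Dense(10)
    net.initialize()
    trainer = gluon.Trainer(net.collect_params(),
                            optimizer.SGD(learning_rate=0.1, aggregate_num=4),
                            update_on_kvstore=None)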