apache · eric-haibin-lin · Jan 24, 2019 · Nov 20, 2018 · Nov 21, 2018 · Nov 26, 2018
@@ -97,7 +97,8 @@ class Arg:
         'double':'double',\
         'double or None':'dmlc::optional<double>',\
         'Shape or None':'dmlc::optional<Shape>',\
-        'string':'const std::string&'}
+        'string':'const std::string&',\
+        'tuple of <float>':'nnvm::Tuple<mx_float>'}
     name = ''
     type = ''
     description = ''
@@ -407,6 +408,7 @@ def ParseAllOps():
                       "#include \"mxnet-cpp/op_util.h\"\n"
                       "#include \"mxnet-cpp/operator.h\"\n"
                       "#include \"dmlc/optional.h\"\n"
+                      "#include \"nnvm/tuple.h\"\n"
                       "\n"
                       "namespace mxnet {\n"
                       "namespace cpp {\n"

@@ -145,6 +145,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
   - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device,
     when kvstore's type is `device`.
 
+* MXNET_UPDATE_ON_KVSTORE
+  - Values: 0(false) or 1(true) ```(default=1)```
+  - If true, weight updates are performed during the communication step, if possible.
+
 ## Memonger
 
 * MXNET_BACKWARD_DO_MIRROR
@@ -218,6 +222,11 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
   - When the array size is bigger than or equal to  this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count.
   - When the array size is less than this threshold, NDArray::Copy(from , to)) is implemented by memcpy in single thread.
 
+* MXNET_OPTIMIZER_AGGREGATION_SIZE
+  - Values: Int ```(default=4)```
+  - Maximum value is 60.
+  - This variable controls how many weights will be updated in a single call to optimizer (for optimizers that support aggregation, currently limited to SGD).
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```

@@ -393,6 +393,8 @@ def update(self, batch_size, ignore_stale_grad=False):
         self._update(ignore_stale_grad)
 
     def _update(self, ignore_stale_grad=False):
+        updates = [[] for _ in self._updaters]
+
         for i, param in enumerate(self._params):
             if param.grad_req == 'null':
                 continue
@@ -416,11 +418,17 @@ def _update(self, ignore_stale_grad=False):
                     self._kvstore.pull(i, param.list_data(), priority=-i)
                 continue
 
-            for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()):
+            for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()):
                 if not ignore_stale_grad or arr._fresh_grad:
-                    upd(i, grad, arr)
+                    upd.append((i, grad, arr))
                     arr._fresh_grad = False
 
+        if not (self._kvstore and self._update_on_kvstore):
+            for updater, upd in zip(self._updaters, updates):
+                if upd:
+                    i, w, g = zip(*upd)
+                    updater(i, w, g)
+
     def save_states(self, fname):
         """Saves trainer states (e.g. optimizer, momentum) to a file.
 

@@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params):
     arg_params : dict of str to `NDArray`.
         Model parameter, dict of name to `NDArray` of net's weights.
     """
-    update_on_kvstore = True
+    update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1")))
     if kvstore is None:
         kv = None
     elif isinstance(kvstore, kvs.KVStore):
         kv = kvstore
     elif isinstance(kvstore, str):
         # create kvstore using the string type
-        if num_device is 1 and 'dist' not in kvstore:
+        if num_device == 1 and 'dist' not in kvstore:
             # no need to use kv for single device and single machine
             kv = None
         else:
@@ -162,6 +162,7 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
 def _update_params(param_arrays, grad_arrays, updater, num_device,
                    kvstore=None, param_names=None):
     """Perform update of param_arrays from grad_arrays not on kvstore."""
+    updates = [[] for _ in range(num_device)]
     for i, pair in enumerate(zip(param_arrays, grad_arrays)):
         arg_list, grad_list = pair
         if grad_list[0] is None:
@@ -178,7 +179,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device,
             # state for the same index but on diff devs, TODO(mli)
             # use a better solution later
             w, g = p
-            updater(index*num_device+k, g, w)
+            updates[k].append((index*num_device+k, g, w))
+    for dev_updates in updates:
+        i, w, g = zip(*dev_updates)
+        updater(i, w, g)
 
 
 def _multiple_callbacks(callbacks, *args, **kwargs):