Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Aggregate SGD #13346

Merged
merged 16 commits into from
Jan 24, 2019
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cpp-package/scripts/OpWrapperGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ class Arg:
'double':'double',\
'double or None':'dmlc::optional<double>',\
'Shape or None':'dmlc::optional<Shape>',\
'string':'const std::string&'}
'string':'const std::string&',\
'tuple of <float>':'nnvm::Tuple<mx_float>'}
name = ''
type = ''
description = ''
Expand Down Expand Up @@ -407,6 +408,7 @@ def ParseAllOps():
"#include \"mxnet-cpp/op_util.h\"\n"
"#include \"mxnet-cpp/operator.h\"\n"
"#include \"dmlc/optional.h\"\n"
"#include \"nnvm/tuple.h\"\n"
"\n"
"namespace mxnet {\n"
"namespace cpp {\n"
Expand Down
9 changes: 9 additions & 0 deletions docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
- If true, MXNet tries to use GPU peer-to-peer communication, if available on your device,
when kvstore's type is `device`.

* MXNET_UPDATE_ON_KVSTORE
- Values: 0(false) or 1(true) ```(default=1)```
- If true, weight updates are performed during the communication step, if possible.

## Memonger

* MXNET_BACKWARD_DO_MIRROR
Expand Down Expand Up @@ -218,6 +222,11 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
- When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count.
- When the array size is less than this threshold, NDArray::Copy(from , to)) is implemented by memcpy in single thread.

* MXNET_OPTIMIZER_AGGREGATION_SIZE
- Values: Int ```(default=4)```
- Maximum value is 60.
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
- This variable controls how many weights will be updated in a single call to optimizer (for optimizers that support aggregation, currently limited to SGD).

Settings for Minimum Memory Usage
---------------------------------
- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
Expand Down
12 changes: 10 additions & 2 deletions python/mxnet/gluon/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,8 @@ def update(self, batch_size, ignore_stale_grad=False):
self._update(ignore_stale_grad)

def _update(self, ignore_stale_grad=False):
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
updates = [[] for _ in self._updaters]

for i, param in enumerate(self._params):
if param.grad_req == 'null':
continue
Expand All @@ -416,11 +418,17 @@ def _update(self, ignore_stale_grad=False):
self._kvstore.pull(i, param.list_data(), priority=-i)
continue

for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()):
for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()):
if not ignore_stale_grad or arr._fresh_grad:
upd(i, grad, arr)
upd.append((i, grad, arr))
arr._fresh_grad = False

if not (self._kvstore and self._update_on_kvstore):
for updater, upd in zip(self._updaters, updates):
if upd:
i, w, g = zip(*upd)
updater(i, w, g)

def save_states(self, fname):
"""Saves trainer states (e.g. optimizer, momentum) to a file.

Expand Down
10 changes: 7 additions & 3 deletions python/mxnet/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params):
arg_params : dict of str to `NDArray`.
Model parameter, dict of name to `NDArray` of net's weights.
"""
update_on_kvstore = True
update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1")))
if kvstore is None:
kv = None
elif isinstance(kvstore, kvs.KVStore):
kv = kvstore
elif isinstance(kvstore, str):
# create kvstore using the string type
if num_device is 1 and 'dist' not in kvstore:
if num_device == 1 and 'dist' not in kvstore:
# no need to use kv for single device and single machine
kv = None
else:
Expand Down Expand Up @@ -162,6 +162,7 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
def _update_params(param_arrays, grad_arrays, updater, num_device,
kvstore=None, param_names=None):
"""Perform update of param_arrays from grad_arrays not on kvstore."""
updates = [[] for _ in range(num_device)]
for i, pair in enumerate(zip(param_arrays, grad_arrays)):
arg_list, grad_list = pair
if grad_list[0] is None:
Expand All @@ -178,7 +179,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device,
# state for the same index but on diff devs, TODO(mli)
# use a better solution later
w, g = p
updater(index*num_device+k, g, w)
updates[k].append((index*num_device+k, g, w))
for dev_updates in updates:
i, w, g = zip(*dev_updates)
updater(i, w, g)


def _multiple_callbacks(callbacks, *args, **kwargs):
Expand Down
Loading