apache · eric-haibin-lin · Apr 7, 2020 · Feb 5, 2020 · Feb 5, 2020 · Feb 7, 2020
@@ -1354,6 +1354,7 @@ integrationtest_ubuntu_gpu_dist_kvstore() {
     ../../tools/launch.py -n 4 --launcher local python dist_device_sync_kvstore_custom.py
     ../../tools/launch.py --p3 -n 4 --launcher local python dist_device_sync_kvstore_custom.py
     ../../tools/launch.py -n 4 --launcher local python dist_sync_kvstore.py --type=init_gpu
+    ../../tools/launch.py -n 1 -s 1 --byteps --env NVIDIA_VISIBLE_DEVICES:0,1 python3 dist_device_sync_kvstore_byteps.py
     popd
 }
 
@@ -1781,7 +1782,7 @@ build_julia_docs() {
    export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
-   julia_doc_path='julia/docs/site/'
+   julia_doc_path='julia/docs/site/'f
    julia_doc_artifact='docs/_build/julia-artifacts.tgz'
 
    echo "Julia will check for MXNet in $MXNET_HOME/lib"

@@ -22,3 +22,4 @@
 from .kvstore import *
 from .base import *
 from .kvstore_server import *
+from .byteps import *
@@ -314,6 +314,8 @@ def pushpull(self, key, value, out=None, priority=0):
     def is_capable(capability):
         """Queries if the KVStore type supports certain capability, such as optimizer algorithm,
         gradient compression, sparsity, etc.
+        If the kvstore does not store weights on the server side, no optimizer is supported
+        and this function returns False.
 
         Parameters
         ----------
@@ -428,9 +430,13 @@ def create(name='local'):
     No two updates happen on the same weight at the same time. However, the order is not
     guaranteed.
 
+    ``byteps``: Use BytePS as the broadcast/pushpull backend.
+    This kind of kvstore doesn't store weights, so the kvstore server does not run an optimizer.
+    BytePS doesn't support CPU-only training, so be sure to enable GPU training when using this kvstore.
+
     Parameters
     ----------
-    name : {'local', 'device', 'nccl', 'dist_sync', 'dist_device_sync', 'dist_async', 'horovod'}
+    name : {'local', 'device', 'nccl', 'dist_sync', 'dist_device_sync', 'dist_async', 'horovod', 'byteps'}
         The type of KVStore.
     Returns
     -------

@@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+""" BytePS backend for MXNet KVStore"""
+from __future__ import absolute_import
+
+from ..ndarray import NDArray
+from .base import KVStoreBase
+
+__all__ = ['BytePS']
+
+
+@KVStoreBase.register
+class BytePS(KVStoreBase):
+    """BytePS backend for MXNet KVStore interface.
+
+    Every operation is delegated to the ``byteps.mxnet`` module, which is imported
+    in ``__init__`` and kept in ``self.handle``.
+    """
+
+    def __init__(self):
+        """Initializes a new KVStore.
+
+        Raises
+        ------
+        ImportError
+            If ``byteps.mxnet`` cannot be imported.
+        """
+        try:
+            import byteps.mxnet as bps
+        except ImportError:
+            # Carry the installation hint in the exception itself instead of
+            # printing it to stdout, where it is easily lost.
+            raise ImportError('Did not find BytePS library. Please install BytePS first')
+        self.handle = bps
+        self.handle.init()
+
+    @staticmethod
+    def _as_single_ndarray(value):
+        """Return `value` as one NDArray, unwrapping a one-element list or tuple."""
+        if isinstance(value, (list, tuple)) and len(value) == 1:
+            value = value[0]
+        assert isinstance(
+            value, NDArray), "The type of value can only be NDArray or list of NDArray which has only one element."
+        return value
+
+    @staticmethod
+    def _copy_to_outputs(value, out):
+        """Copy `value` into `out`, which may be None, one NDArray, or a list/tuple of NDArray."""
+        if out is None:
+            # nothing to copy: the result stays in `value`
+            return
+        if not isinstance(out, (list, tuple)):
+            out = [out]
+        for o in out:
+            # `value` already holds the result, so copying it onto itself is skipped
+            if o is not value:
+                value.copyto(o)
+
+    def broadcast(self, key, value, out, priority=0):
+        """ Broadcast the value NDArray at rank 0 to all ranks' out. If out is None,
+        the result is stored in `value`.
+
+        On every rank other than 0, `value` is multiplied by 0 in place before the
+        push-pull, so the array passed as `value` is modified by this call.
+
+        Parameters
+        ----------
+        key : str, or int
+            The key.
+        value : NDArray, or list of NDArray
+            Value corresponding to the key. A list is accepted only if it holds
+            exactly one NDArray.
+        out : NDArray, list of NDArray, or None
+            Arrays that receive a copy of the broadcast result.
+        priority : int, optional
+            Passed to ``byteps_push_pull`` as its ``priority`` argument.
+
+        Examples
+        --------
+        >>> # broadcast a single key-value pair
+        >>> shape = (2,3)
+        >>> kv = mx.kv.create('byteps')
+        >>> a = mx.nd.zeros(shape)
+        >>> kv.broadcast('3', mx.nd.ones(shape)*2, out=a)
+        >>> print(a.asnumpy())
+        [[ 2.  2.  2.]
+        [ 2.  2.  2.]]
+        """
+
+        # do not accept list or tuple for key
+        assert isinstance(key, (str, int))
+
+        value = self._as_single_ndarray(value)
+
+        # for non-root-rank, assign value with 0, thus the result of pushpull will be
+        # equal to the value of root-rank, thus implementing broadcast.
+        root_rank = 0
+        if self.rank != root_rank:
+            value.__imul__(0)
+        self.handle.byteps_push_pull(value, version=0, priority=priority,
+                                     name=str(key), is_average=False)
+        # Make sure tensors pushed to MXNet engine get processed such that all
+        # workers are synced before starting training.
+        value.wait_to_read()
+
+        self._copy_to_outputs(value, out)
+
+    def pushpull(self, key, value, out=None, priority=0):
+        """ Performs push and pull a single value from the store.
+        This function is coalesced form of push and pull operations.
+        `value` is pushed to the kvstore server for the specified keys and the aggregated
+        values are pulled from the server to `out`. If `out` is not specified the pulled
+        values are written to `value`.
+
+        Parameters
+        ----------
+        key : str, or int
+            The key.
+        value : NDArray, or list of NDArray
+            Value corresponding to the key. A list is accepted only if it holds
+            exactly one NDArray.
+        out : NDArray, list of NDArray, or None
+            Arrays that receive a copy of the aggregated result.
+        priority : int, optional
+            The priority of the operation.
+            Higher priority operations are likely to be executed before other actions.
+
+        Examples
+        --------
+        >>> # pushpull a single key-value pair
+        >>> kv.pushpull('3', mx.nd.ones(shape)*8, out=a)
+        >>> print(a.asnumpy())
+        [[ 8.  8.  8.]
+        [ 8.  8.  8.]]
+        """
+        # the most common operation operates on one NDArray as `value`, and
+        # `out` is set to None, for inplace pushpull.
+
+        assert isinstance(key, (str, int))
+
+        value = self._as_single_ndarray(value)
+
+        self.handle.byteps_push_pull(value, version=0, priority=priority,
+                                     name=str(key), is_average=False)
+
+        self._copy_to_outputs(value, out)
+
+    @staticmethod
+    def is_capable(capability):
+        """Queries if the KVStore type supports certain capability, such as optimizer algorithm,
+        gradient compression, sparsity, etc.
+        As the byteps server does not store weights, this function returns False for any capability.
+
+        Parameters
+        ----------
+        capability: str
+            The capability to query
+        Returns
+        -------
+        result : bool
+            Whether the capability is supported or not. Always False for this backend.
+        """
+        return False
+
+    @property
+    def type(self):
+        """ Returns the type of this kvstore.
+
+        Returns
+        -------
+        type : str
+            the string type, always ``'byteps'``
+        """
+        return 'byteps'
+
+    @property
+    def local_rank(self):
+        """ Returns the local rank of this worker on the node.
+
+        Returns
+        -------
+        rank : int
+            The local rank of this node, which is in range [0, num_workers_on_current_node())
+        """
+        return self.handle.local_rank()
+
+    @property
+    def rank(self):
+        """ Returns the rank of this worker node.
+
+        Returns
+        -------
+        rank : int
+            The rank of this node, which is in range [0, num_workers())
+        """
+        return self.handle.rank()
+
+    @property
+    def num_workers(self):
+        """Returns the number of worker nodes.
+
+        Returns
+        -------
+        size : int
+            The number of worker nodes, as reported by ``self.handle.size()``.
+        """
+        return self.handle.size()
+
+    def set_optimizer(self, optimizer):
+        """
+        Not implemented yet; always raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def save_optimizer_states(self, fname, dump_optimizer=False):
+        """
+        Not implemented yet; always raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def load_optimizer_states(self, fname):
+        """
+        Not implemented yet; always raises NotImplementedError.
+        """
+        raise NotImplementedError()
diff --git a/tests/nightly/dist_device_sync_kvstore_byteps.py b/tests/nightly/dist_device_sync_kvstore_byteps.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+sys.path.insert(0, "../../python/")
+import mxnet as mx
+import numpy as np
+import numpy.random as rnd
+import time
+import argparse
+from mxnet.log import get_logger
+import logging
+from mxnet.kvstore import BytePS
+# Logger used by every test below; created with level=logging.DEBUG.
+logger = get_logger("Byteps-Backend-Test", level=logging.DEBUG)
+
+# Command-line options: --name is the kvstore type handed to mx.kv.create
+# further down (default 'byteps').
+parser = argparse.ArgumentParser(description='kvstore test')
+parser.add_argument('--name', type=str, default='byteps')
+args = parser.parse_args()
+
+def check_diff_to_scalar(A, x, rank=None):
+    """Assert that every element of `A` equals the scalar `x`.
+
+    The check passes only when the sum of absolute differences is exactly 0.
+    On failure the assertion message carries ``(rank, A.asnumpy(), x)``.
+    """
+    total_abs_diff = np.sum(np.abs((A - x).asnumpy()))
+    assert total_abs_diff == 0, (rank, A.asnumpy(), x)
+
+# Test fixtures: key groups (each a disjoint range of 100 string keys) and array shapes.
+keys = ['3', '5', '7']
+init_test_keys = list(map(str, range(200, 300)))
+init_test_keys_big = list(map(str, range(300, 400)))
+init_test_keys_device = list(map(str, range(400, 500)))
+init_test_keys_device_big = list(map(str, range(500, 600)))
+
+shape = (2, 3)
+big_shape = (1200, 1200)        # bigger than MXNET_KVSTORE_BIGARRAY_BOUND
+
+# A single kvstore shared by all tests; its rank and worker count are read once here.
+kv = mx.kv.create(args.name)
+my_rank = kv.rank
+my_num_workers = kv.num_workers
+
+# True when MXNet reports at least one GPU.
+has_gpu = mx.context.num_gpus() > 0
+
+def current_context(device=False):
+    """Return the context the test arrays should be created on.
+
+    Parameters
+    ----------
+    device : bool
+        When true and a GPU is available, return ``mx.gpu(kv.local_rank)``.
+
+    Returns
+    -------
+    The GPU context indexed by this worker's local rank, or
+    ``mx.current_context()`` when `device` is false or no GPU is available.
+    """
+    if has_gpu and device:
+        return mx.gpu(kv.local_rank)
+    return mx.current_context()
+
+def test_pushpull():
+    """Check in-place ``kv.pushpull`` on a small and a big array.
+
+    Keys '3' and '99' are first broadcast, then each is push-pulled `nrepeat`
+    times with an array filled with ``my_rank + 1``; every result is compared
+    against ``(my_num_workers + 1) * my_num_workers * num_gpus / 2``.
+    """
+    # NOTE(review): hard-coded; presumably the number of GPUs the CI job exposes - confirm.
+    num_gpus = 2
+    cases = (('3', shape), ('99', big_shape))
+
+    def device_ones(arr_shape):
+        # array of ones on the device context of this worker
+        return mx.nd.ones(arr_shape, ctx=current_context(device=True))
+
+    def check_default_keys(nrepeat=3):
+        # initialise both keys through a broadcast before any pushpull
+        for key, arr_shape in cases:
+            kv.broadcast(key, device_ones(arr_shape), device_ones(arr_shape))
+
+        scale = my_rank + 1
+        expected = (my_num_workers + 1) * my_num_workers * num_gpus / 2
+        for _ in range(nrepeat):
+            for key, arr_shape in cases:
+                arr = device_ones(arr_shape) * scale
+                # in place: the aggregated result is written back into `arr`
+                kv.pushpull(key, arr)
+                check_diff_to_scalar(arr, expected)
+
+    check_default_keys(nrepeat=3)
+    logger.debug('worker ' + str(my_rank) + ' is done')
+
+def test_broadcast():
+    """Check ``kv.broadcast`` for a small and a big shape on the device context.
+
+    For the i-th key an array filled with ``i`` is broadcast into a zero array,
+    which must afterwards equal ``i`` everywhere.
+    """
+    def check_broadcast(kv, cur_keys, cur_shape, device=False):
+        logger.debug("check_broadcast: {}, {}, {}, {}".format(kv, cur_keys, cur_shape, device))
+        ctx = current_context(device=device)
+        val = [mx.nd.zeros(cur_shape, ctx) for _ in cur_keys]
+        for i, key in enumerate(cur_keys):
+            expected = i
+            tmpNDarray = [mx.nd.ones(cur_shape, ctx) * i]
+            kv.broadcast(key, tmpNDarray, out=val[i])
+            check_diff_to_scalar(val[i], expected, my_rank)
+        # `val` needs a %s placeholder; without one logging fails to format the record
+        logger.debug("check_broadcast passed: %s", val)
+
+    # Byteps doesn't support pure CPU training, so the CPU key groups
+    # (init_test_keys, init_test_keys_big) are not exercised here.
+    check_broadcast(kv, init_test_keys_device, shape, device=True)
+    check_broadcast(kv, init_test_keys_device_big, big_shape, device=True)
+    logger.debug('worker ' + str(my_rank) + ' is initialized')
+
+def test_type():
+    """Check that the kvstore reports the type name given on the command line."""
+    reported = kv.type
+    requested = args.name
+    assert reported == requested
+
+if __name__ == "__main__":
+    # Run the checks in a fixed order, logging before and after each one.
+    for begin_msg, run_test, passed_msg in (
+            ("Type Test Begin", test_type, "Type Test Passed"),
+            ("Broadcast Test Begin", test_broadcast, "Broadcast Test Passed"),
+            ("PushPull Test Begin", test_pushpull, "PushPull Test Passed"),
+    ):
+        logger.debug(begin_msg)
+        run_test()
+        logger.debug(passed_msg)