horovod · eric-haibin-lin · Jul 23, 2020 · Aug 3, 2020 · Aug 4, 2020 · Aug 4, 2020
diff --git a/.buildkite/gen-pipeline.sh b/.buildkite/gen-pipeline.sh
@@ -151,9 +151,15 @@ run_mpi_integration() {
       "bash -c \"${oneccl_env} \\\$(cat /mpirun_command) python /horovod/examples/pytorch_mnist.py\""
   fi
 
-  run_test "${test}" "${queue}" \
-    ":muscle: Test MXNet MNIST (${test})" \
-    "bash -c \"${oneccl_env} OMP_NUM_THREADS=1 \\\$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py\""
+  if [[ ${test} == *"mxnet2_"* ]] || [[ ${test} == *"mxnethead"* ]]; then
+    run_test "${test}" "${queue}" \
+      ":muscle: Test MXNet2 MNIST (${test})" \
+      "bash -c \"${oneccl_env} OMP_NUM_THREADS=1 \\\$(cat /mpirun_command) python /horovod/examples/mxnet2_mnist.py\""
+  else
+    run_test "${test}" "${queue}" \
+      ":muscle: Test MXNet MNIST (${test})" \
+      "bash -c \"${oneccl_env} OMP_NUM_THREADS=1 \\\$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py\""
+  fi
 
   # tests that should be executed only with the latest release since they don't test
   # a framework-specific functionality
@@ -249,9 +255,15 @@ run_gloo_integration() {
     ":fire: Test PyTorch MNIST (${test})" \
     "horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch_mnist.py"
 
-  run_test "${test}" "${queue}" \
-    ":muscle: Test MXNet MNIST (${test})" \
-    "horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet_mnist.py"
+  if [[ ${test} == *"mxnet2_"* ]] || [[ ${test} == *"mxnethead"* ]]; then
+    run_test "${test}" "${queue}" \
+      ":muscle: Test MXNet2 MNIST (${test})" \
+      "horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet2_mnist.py"
+  else
+    run_test "${test}" "${queue}" \
+      ":muscle: Test MXNet MNIST (${test})" \
+      "horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet_mnist.py"
+  fi
 
   # Elastic
   local elastic_tensorflow="test_elastic_tensorflow.py"
@@ -334,9 +346,15 @@ run_single_integration() {
     ":fire: Single PyTorch MNIST (${test})" \
     "bash -c \"${oneccl_env} python /horovod/examples/pytorch_mnist.py --epochs 3\""
 
-  run_test "${test}" "${queue}" \
-    ":muscle: Single MXNet MNIST (${test})" \
-    "bash -c \"${oneccl_env} python /horovod/examples/mxnet_mnist.py --epochs 3\""
+  if [[ ${test} == *"mxnet2_"* ]] || [[ ${test} == *"mxnethead"* ]]; then
+    run_test "${test}" "${queue}" \
+      ":muscle: Single MXNet2 MNIST (${test})" \
+      "bash -c \"${oneccl_env} python /horovod/examples/mxnet2_mnist.py --epochs 3\""
+  else
+    run_test "${test}" "${queue}" \
+      ":muscle: Single MXNet MNIST (${test})" \
+      "bash -c \"${oneccl_env} python /horovod/examples/mxnet_mnist.py --epochs 3\""
+  fi
 }
 
 build_docs() {

diff --git a/Dockerfile.test.cpu b/Dockerfile.test.cpu
@@ -139,7 +139,7 @@ RUN pip install "Pillow<7.0" --no-deps
 
 # Install MXNet.
 RUN if [[ ${MXNET_PACKAGE} == "mxnet-nightly" ]]; then \
-        pip install --pre mxnet-mkl -f https://dist.mxnet.io/python/all; \
+        pip install --pre mxnet -f https://dist.mxnet.io/python/all; \
     else \
         pip install ${MXNET_PACKAGE} ; \
     fi

diff --git a/Dockerfile.test.gpu b/Dockerfile.test.gpu
@@ -116,7 +116,7 @@ RUN pip install "Pillow<7.0" --no-deps
 
 # Install MXNet.
 RUN if [[ ${MXNET_PACKAGE} == "mxnet-nightly" ]]; then \
-        pip install --pre mxnet-cu101mkl -f https://dist.mxnet.io/python/all; \
+        pip install --pre mxnet-cu101 -f https://dist.mxnet.io/python/all; \
     else \
         pip install ${MXNET_PACKAGE} ; \
     fi

diff --git a/examples/mxnet2_mnist.py b/examples/mxnet2_mnist.py
@@ -0,0 +1,171 @@
+import argparse
+import logging
+import os
+import zipfile
+import time
+
+import mxnet as mx
+import horovod.mxnet as hvd
+from mxnet import autograd, gluon, nd
+from mxnet.test_utils import download
+
+# Training settings: command-line options, parsed into the module-level `args`
+parser = argparse.ArgumentParser(description='MXNet MNIST Example')
+
+parser.add_argument('--batch-size', type=int, default=64,
+                    help='training batch size (default: 64)')
+parser.add_argument('--dtype', type=str, default='float32',
+                    help='training data type (default: float32)')
+parser.add_argument('--epochs', type=int, default=5,
+                    help='number of training epochs (default: 5)')
+parser.add_argument('--lr', type=float, default=0.01,
+                    help='learning rate (default: 0.01)')
+parser.add_argument('--momentum', type=float, default=0.9,
+                    help='SGD momentum (default: 0.9)')
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disable training on GPU (default: False)')
+args = parser.parse_args()
+
+if not args.no_cuda:
+    # Fall back to CPU when MXNet lists no GPUs, even if --no-cuda was not given.
+    if not mx.test_utils.list_gpus():
+        args.no_cuda = True
+
+logging.basicConfig(level=logging.INFO)
+logging.info(args)  # log the effective settings
+
+
+# Download and unpack MNIST into "data-<rank>" and return (train_iter, val_iter)
+def get_mnist_iterator(rank):
+    data_dir = "data-%d" % rank  # `rank` only selects the download directory
+    if not os.path.isdir(data_dir):
+        os.makedirs(data_dir)
+    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
+                             dirname=data_dir)
+    with zipfile.ZipFile(zip_file_path) as zf:
+        zf.extractall(data_dir)
+
+    input_shape = (1, 28, 28)
+    batch_size = args.batch_size
+
+    train_iter = mx.io.MNISTIter(
+        image="%s/train-images-idx3-ubyte" % data_dir,
+        label="%s/train-labels-idx1-ubyte" % data_dir,
+        input_shape=input_shape,
+        batch_size=batch_size,
+        shuffle=True,
+        flat=False,
+        num_parts=hvd.size(),  # split the training set across all workers
+        part_index=hvd.rank()  # shard index comes from hvd.rank(), not `rank`
+    )
+
+    val_iter = mx.io.MNISTIter(  # no num_parts/part_index passed here
+        image="%s/t10k-images-idx3-ubyte" % data_dir,
+        label="%s/t10k-labels-idx1-ubyte" % data_dir,
+        input_shape=input_shape,
+        batch_size=batch_size,
+        flat=False,
+    )
+
+    return train_iter, val_iter
+
+
+# Build the network: two Conv2D+MaxPool2D stages, Flatten, Dense(512), Dense(10)
+def conv_nets():
+    net = gluon.nn.HybridSequential()
+    net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'))
+    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
+    net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'))
+    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
+    net.add(gluon.nn.Flatten())
+    net.add(gluon.nn.Dense(512, activation="relu"))
+    net.add(gluon.nn.Dense(10))  # final layer: 10 output units, no activation
+    return net
+
+
+# Function to evaluate accuracy for a model
+def evaluate(model, data_iter, context):
+    data_iter.reset()
+    metric = mx.gluon.metric.Accuracy()
+    for _, batch in enumerate(data_iter):
+        data = batch.data[0].as_in_context(context)
+        label = batch.label[0].as_in_context(context)
+        output = model(data.astype(args.dtype, copy=False))
+        metric.update([label], [output])
+
+    return metric.get()
+
+
+# Initialize Horovod
+hvd.init()
+
+# Horovod: pin context to local rank
+context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
+num_workers = hvd.size()
+
+# Load training and validation data
+train_data, val_data = get_mnist_iterator(hvd.rank())
+
+# Build model
+model = conv_nets()
+model.cast(args.dtype)
+model.hybridize()
+
+# Create optimizer
+optimizer_params = {'momentum': args.momentum,
+                    'learning_rate': args.lr * hvd.size()}
+opt = mx.optimizer.create('sgd', **optimizer_params)
+
+# Initialize parameters
+initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
+                             magnitude=2)
+model.initialize(initializer, ctx=context)
+
+# Horovod: fetch and broadcast parameters
+params = model.collect_params()
+if params is not None:
+    hvd.broadcast_parameters(params, root_rank=0)
+
+# Horovod: create DistributedTrainer, a subclass of gluon.Trainer
+trainer = hvd.DistributedTrainer(params, opt)
+
+# Create loss function and train metric
+loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
+metric = mx.gluon.metric.Accuracy()
+
+# Train model
+for epoch in range(args.epochs):
+    tic = time.time()
+    train_data.reset()
+    metric.reset()
+    for nbatch, batch in enumerate(train_data, start=1):
+        data = batch.data[0].as_in_context(context)
+        label = batch.label[0].as_in_context(context)
+        with autograd.record():
+            output = model(data.astype(args.dtype, copy=False))
+            loss = loss_fn(output, label)
+        loss.backward()
+        trainer.step(args.batch_size)
+        metric.update([label], [output])
+
+        if nbatch % 100 == 0:
+            name, acc = metric.get()
+            logging.info('[Epoch %d Batch %d] Training: %s=%f' %
+                         (epoch, nbatch, name, acc))
+
+    if hvd.rank() == 0:
+        elapsed = time.time() - tic
+        speed = nbatch * args.batch_size * hvd.size() / elapsed
+        logging.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f',
+                     epoch, speed, elapsed)
+
+    # Evaluate model accuracy
+    _, train_acc = metric.get()
+    name, val_acc = evaluate(model, val_data, context)
+    if hvd.rank() == 0:
+        logging.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f', epoch, name,
+                     train_acc, name, val_acc)
+
+    if hvd.rank() == 0 and epoch == args.epochs - 1:
+        assert val_acc > 0.96, "Achieved accuracy (%f) is lower than expected\
+                                (0.96)" % val_acc
diff --git a/horovod/mxnet/__init__.py b/horovod/mxnet/__init__.py
@@ -100,18 +100,21 @@ def __init__(self, params, optimizer, optimizer_params=None):
     def _allreduce_grads(self):
         if size() == 1: return
 
+        # In MXNet 2.0, param.name is no longer unique, so the enumeration index
+        # is used as the tensor name instead. Because Horovod requires Python 3.6+,
+        # self._params need not be sorted: iterating a Python dict is deterministic.
         for i, param in enumerate(self._params):
             if param.grad_req != 'null':
                 allreduce_(param.list_grad()[0], average=False,
-                           name=param.name, priority=-i)
+                           name=str(i), priority=-i)
 
 
 # Wrapper to inject Horovod broadcast after parameter initialization
-def _append_broadcast_init(param, root_rank):
+def _append_broadcast_init(param, root_rank, name):
     init_impl = getattr(param, '_init_impl')
     def wrapped_init_impl(self, *args, **kwargs):
         init_impl(*args, **kwargs)
-        broadcast_(self.data(), root_rank=root_rank, name=self.name)
+        broadcast_(self.data(), root_rank=root_rank, name=name)
     return wrapped_init_impl
 
 
@@ -132,17 +135,25 @@ def broadcast_parameters(params, root_rank=0):
 
     tensors = []
     names = []
-    if isinstance(params, dict):
-        names, tensors = zip(*params.items())
-    elif isinstance(params, mx.gluon.parameter.ParameterDict):
+    try:
+        from mxnet.gluon.parameter import ParameterDict
+        valid_types = (dict, ParameterDict)
+    except ImportError:
+        valid_types = (dict,)
+    if isinstance(params, valid_types):
         for name, p in sorted(params.items()):
             try:
-                tensors.append(p.data())
+                if isinstance(p, mx.gluon.parameter.Parameter):
+                    tensors.append(p.data())
+                else:
+                    tensors.append(p)
                 names.append(name)
             except mx.gluon.parameter.DeferredInitializationError:
                 # Inject wrapper method with post-initialization broadcast to
                 # handle parameters with deferred initialization
-                new_init = _append_broadcast_init(p, root_rank)
+                # Use the key in params rather than param.name as the broadcast
+                # name, since param.name is no longer unique in MXNet 2.0.
+                new_init = _append_broadcast_init(p, root_rank, name)
                 p._init_impl = types.MethodType(new_init, p)
     else:
         raise ValueError('invalid params of type: %s' % type(params))

diff --git a/setup.py b/setup.py
@@ -1127,6 +1127,14 @@ def build_mx_extension(build_ext, global_options):
         mxnet_mpi_lib.define_macros += [('MXNET_USE_MKLDNN', '1')]
     else:
         mxnet_mpi_lib.define_macros += [('MXNET_USE_MKLDNN', '0')]
+    cxx11_abi = '1'
+    try:
+        import mxnet as mx
+        if int(mx.library.compiled_with_gcc_cxx11_abi()) == 0:
+            cxx11_abi = '0'
+    except AttributeError:
+        pass
+    mxnet_mpi_lib.define_macros += [('_GLIBCXX_USE_CXX11_ABI', cxx11_abi)]
     mxnet_mpi_lib.define_macros += [('MSHADOW_USE_MKL', '0')]
     mxnet_mpi_lib.define_macros += [('MSHADOW_USE_F16C', '0')]
     mxnet_mpi_lib.include_dirs = options['INCLUDES']

diff --git a/test/data/expected_buildkite_pipeline.yaml b/test/data/expected_buildkite_pipeline.yaml
@@ -1342,8 +1342,8 @@ steps:
     automatic: true
   agents:
     queue: cpu
-- label: ':muscle: Test MXNet MNIST (test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0)'
-  command: bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py"
+- label: ':muscle: Test MXNet2 MNIST (test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0)'
+  command: bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet2_mnist.py"
   plugins:
   - docker-compose#v2.6.0:
       run: test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0
@@ -1412,8 +1412,8 @@ steps:
     automatic: true
   agents:
     queue: cpu
-- label: ':muscle: Single MXNet MNIST (test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0)'
-  command: bash -c " python /horovod/examples/mxnet_mnist.py --epochs 3"
+- label: ':muscle: Single MXNet2 MNIST (test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0)'
+  command: bash -c " python /horovod/examples/mxnet2_mnist.py --epochs 3"
   plugins:
   - docker-compose#v2.6.0:
       run: test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0
@@ -2744,8 +2744,8 @@ steps:
     automatic: true
   agents:
     queue: 2x-gpu-g4
-- label: ':muscle: Test MXNet MNIST (test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0)'
-  command: bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py"
+- label: ':muscle: Test MXNet2 MNIST (test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0)'
+  command: bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet2_mnist.py"
   plugins:
   - docker-compose#v2.6.0:
       run: test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0

diff --git a/test/test_mxnet.py b/test/test_mxnet.py
@@ -847,13 +847,12 @@ class SimpleNet(HybridBlock):
             def __init__(self, layer_num=6, **kwargs):
                 super(SimpleNet, self).__init__(**kwargs)
                 self._layer_num = layer_num
-                with self.name_scope():
-                    self.ln_l = nn.HybridSequential()
-                    self.dense_l = nn.HybridSequential()
-                    for i in range(layer_num):
-                        self.dense_l.add(nn.Dense(units=32 + layer_num - 1 - i,
-                            flatten=False))
-                        self.ln_l.add(nn.LayerNorm())
+                self.ln_l = nn.HybridSequential()
+                self.dense_l = nn.HybridSequential()
+                for i in range(layer_num):
+                    self.dense_l.add(nn.Dense(units=32 + layer_num - 1 - i,
+                        flatten=False))
+                    self.ln_l.add(nn.LayerNorm())
 
             def hybrid_forward(self, F, data):
                 """