Commit
Merge branch 'master' into ci
changlan authored Jun 27, 2019
2 parents d930999 + 23b161e commit 23dd387
Showing 14 changed files with 598 additions and 73 deletions.
33 changes: 33 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,33 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1.
2.
3.
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Environment (please complete the following information):**
- OS:
- GCC version:
- CUDA and NCCL version:
- Framework (TF, PyTorch, MXNet):

**Additional context**
Add any other context about the problem here.
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
22 changes: 3 additions & 19 deletions README.md
@@ -47,31 +47,15 @@ python setup.py install
```
Note: you may set `BYTEPS_USE_RDMA=1` to install with RDMA support.

Now you can try our [examples](example). Let's say you are using MXNet and want to try a Resnet50 training benchmark:
We provide a [step-by-step tutorial](docs/step-by-step-tutorials.md) for running benchmark training tasks.

```
export NVIDIA_VISIBLE_DEVICES=0,1 \
DMLC_NUM_WORKER=1 \
DMLC_NUM_SERVER=1 \
DMLC_WORKER_ID=0 \
DMLC_ROLE=worker \
DMLC_PS_ROOT_URI=10.0.0.1 \
DMLC_PS_ROOT_PORT=1234 \
DMLC_INTERFACE=eth0
python byteps/launcher/launch.py byteps/example/mxnet/train_imagenet_byteps.py --benchmark 1 --batch-size=32
```

For distributed training, you also need to build a server image. We provide [Dockerfiles](docker) as examples.
You may use the same images for the scheduler and the servers.

Refer to [Documentations](docs) for how to launch distributed jobs and more hands-on tutorials.
Also refer to the [documentation](docs) for how to [launch distributed jobs](docs/running.md) and for more [detailed configuration options](docs/env.md).

## Use BytePS in Your Code

Though completely different at its core, BytePS is highly compatible with the Horovod interfaces (thank you, Horovod community!). We chose the Horovod interfaces to minimize the effort required to try out BytePS.

If your tasks only rely on Horovod's allreduce and broadcast, you should be able to switch to BytePS in 1 minute. Simply replace `import horovod.tensorflow as hvd` by `import byteps.tensorflow as bps`, and then replace all `hvd` in your code by `bps`.
If your tasks only rely on Horovod's allreduce and broadcast, you should be able to switch to BytePS in about a minute: replace `import horovod.tensorflow as hvd` with `import byteps.tensorflow as bps`, then replace every `hvd` in your code with `bps`. If your code invokes `hvd.allreduce` directly, also replace it with `bps.push_pull`.
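
A minimal sketch of that switch is shown below. It assumes `bps.push_pull` accepts an `average` keyword mirroring `hvd.allreduce`; verify the signature against your installed BytePS version.

```python
import tensorflow as tf
import byteps.tensorflow as bps   # was: import horovod.tensorflow as hvd

bps.init()                        # was: hvd.init()

# A direct allreduce becomes push_pull. The `average` keyword is assumed to
# mirror hvd.allreduce and should be checked against your BytePS version.
tensor = tf.random_normal([4])
summed = bps.push_pull(tensor, average=False)   # was: hvd.allreduce(tensor, average=False)
```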

Many of our examples were copied from Horovod and modified in this way. For instance, compare the MNIST example for [BytePS](https://github.com/bytedance/byteps/blob/master/example/tensorflow/tensorflow_mnist.py) and [Horovod](https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py).
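
As a rough illustration of the pattern those examples follow, here is a self-contained TensorFlow 1.x sketch. It assumes BytePS keeps the Horovod-style names implied above (`bps.init`, `bps.local_rank`, `bps.size`, `bps.DistributedOptimizer`, `bps.BroadcastGlobalVariablesHook`); it is not a copy of the repository example, and it still needs the usual BytePS/DMLC environment variables set by the launcher.

```python
import tensorflow as tf
import byteps.tensorflow as bps

bps.init()

# Pin this worker process to one GPU, indexed by its local rank.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(bps.local_rank())

# Toy model so the sketch is self-contained.
x = tf.random_normal([32, 10])
y = tf.layers.dense(x, 1)
loss = tf.reduce_mean(tf.square(y))

# Scale the learning rate by the number of workers and wrap the optimizer,
# exactly as one would with hvd.DistributedOptimizer.
opt = tf.train.MomentumOptimizer(0.01 * bps.size(), momentum=0.9)
opt = bps.DistributedOptimizer(opt)
global_step = tf.train.get_or_create_global_step()
train_op = opt.minimize(loss, global_step=global_step)

hooks = [
    # Broadcast initial variables from rank 0 so all workers start identically.
    bps.BroadcastGlobalVariablesHook(0),
    tf.train.StopAtStepHook(last_step=100),
]

with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
    while not sess.should_stop():
        sess.run(train_op)
```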

9 changes: 0 additions & 9 deletions byteps/mxnet/tensor_util.cc
@@ -134,15 +134,6 @@ NDArray* TensorUtil::New(int device, int dtype) {

void TensorUtil::Free(NDArray* tensor) { delete tensor; }

// Resize tensor to nDimension with length size[i] in dimension i
void TensorUtil::ResizeNd(NDArray* tensor, int nDimension, int64_t* size) {
  TShape mx_shape(nDimension);
  for (int idx = 0; idx < nDimension; ++idx) {
    mx_shape[idx] = size[idx];
  }
  tensor->Reshape(mx_shape);
}

// Copy from tensor to output
void TensorUtil::Copy(NDArray* output, NDArray* tensor) {
if (tensor->shape() != output->shape())
1 change: 0 additions & 1 deletion byteps/mxnet/tensor_util.h
@@ -41,7 +41,6 @@ class TensorUtil {

static NDArray* New(int device, int dtype);
static void Free(NDArray* tensor);
static void ResizeNd(NDArray* tensor, int nDimension, int64_t* size);
static void Copy(NDArray* output, NDArray* tensor);
static void DivideTensorInPlace(NDArray* tensor, int value);

21 changes: 15 additions & 6 deletions docker/Dockerfile.server
@@ -22,16 +22,16 @@ RUN rm -f /tmp/pip.conf &&\

RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi

ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64
ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH
ENV PIP_SOURCE_PROXY https://mirrors.aliyun.com/pypi/simple/
ENV LD_LIBRARY_PATH /root/incubator-mxnet/lib/:/usr/local/lib:$LD_LIBRARY_PATH

ENV SERVER_BUILD_OPTS "USE_BLAS=openblas USE_MKL=1 USE_DIST_KVSTORE=1"
ENV BYTEPS_SERVER_MXNET_PATH /root/incubator-mxnet
ENV MXNET_SERVER_LINK https://github.com/bytedance/incubator-mxnet

ENV BYTEPS_BASE_PATH /usr/local
ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
ca-certificates \
@@ -72,4 +72,13 @@ WORKDIR /root/
RUN git clone --single-branch --branch byteps --recurse-submodules $MXNET_SERVER_LINK

RUN cd $BYTEPS_SERVER_MXNET_PATH && \
make clean_all && make -j16 $SERVER_BUILD_OPTS

RUN cd $BYTEPS_SERVER_MXNET_PATH && \
cd python && \
python setup.py build && \
python setup.py install

RUN cd $BYTEPS_BASE_PATH &&\
git clone --recurse-submodules $BYTEPS_GIT_LINK

137 changes: 137 additions & 0 deletions docker/Dockerfile.worker.mxnet
@@ -0,0 +1,137 @@
# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

FROM nvidia/cuda:9.0-devel-ubuntu16.04
ARG REGION

RUN rm -f /tmp/pip.conf &&\
echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf

RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi

ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64
ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH

ENV BYTEPS_BASE_PATH /usr/local
ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps

ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0

RUN apt-get update &&\
apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
ca-certificates \
git \
curl \
wget \
vim \
libopenblas-dev \
liblapack-dev \
libopencv-dev \
python \
python-dev \
python-setuptools \
libjemalloc-dev \
graphviz \
cmake \
libjpeg-dev \
libpng-dev \
iftop \
lsb-release \
libcudnn7=${CUDNN_VERSION} \
libnuma-dev \
gcc-4.9 \
g++-4.9 \
gcc-4.9-base

RUN apt-get update &&\
apt-get -y install python-pip &&\
pip install --upgrade pip

RUN pip --no-cache-dir install \
matplotlib \
numpy==1.15.2 \
scipy \
sklearn \
pandas \
graphviz==0.9.0 \
mxboard \
tensorboard==1.0.0a6

# Install NCCL
ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881

RUN cd / && \
wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \
cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \
mkdir -p /usr/local/nccl && \
tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \
echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
ldconfig && rm -rf /nccl-$NCCL_VERSION


WORKDIR /root/

RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
ldconfig

RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1


################################ install your framework ################################
# install mxnet
ENV MXNET_VERSION 1.4.1
RUN pip --no-cache-dir install mxnet-cu90==$MXNET_VERSION

################################ install your framework ################################


RUN cd $BYTEPS_BASE_PATH &&\
git clone --recurse-submodules $BYTEPS_GIT_LINK

# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet.
RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \
update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200


# Install BytePS
RUN cd $BYTEPS_PATH &&\
BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install

# Remove GCC pinning
RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \
update-alternatives --remove g++ /usr/bin/g++-4.9 && \
update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9

RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \
rm -rf /usr/local/cuda/lib64/libcuda.so.1

