diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..37439e085 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,33 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. +2. +3. +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Environment (please complete the following information):** + - OS: + - GCC version: + - CUDA and NCCL version: + - Framework (TF, PyTorch, MXNet): + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..bbcbbe7d6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/README.md b/README.md index 897bcd6dd..1ef5480dc 100644 --- a/README.md +++ b/README.md @@ -47,31 +47,15 @@ python setup.py install ``` Note: you may set `BYTEPS_USE_RDMA=1` to install with RDMA support. -Now you can try our [examples](example). Let's say you are using MXNet and want to try a Resnet50 training benchmark: +We provide a [step-by-step tutorial](docs/step-by-step-tutorials.md) for you to run benchmark training tasks. -``` -export NVIDIA_VISIBLE_DEVICES=0,1 \ - DMLC_NUM_WORKER=1 \ - DMLC_NUM_SERVER=1 \ - DMLC_WORKER_ID=0 \ - DMLC_ROLE=worker \ - DMLC_PS_ROOT_URI=10.0.0.1 \ - DMLC_PS_ROOT_PORT=1234 \ - DMLC_INTERFACE=eth0 - -python byteps/launcher/launch.py byteps/example/mxnet/train_imagenet_byteps.py --benchmark 1 --batch-size=32 -``` - -For distributed training, you also need to build a server image. We provide [Dockerfiles](docker) as examples. -You may use the same images for the scheduler and the servers. - -Refer to [Documentations](docs) for how to launch distributed jobs and more hands-on tutorials. +Also refer to [Documentations](docs) for how to [launch distributed jobs](docs/running.md) and more [detailed configurations](docs/env.md). ## Use BytePS in Your Code Though being totally different at its core, BytePS is highly compatible with Horovod interfaces (Thank you, Horovod community!). We chose Horovod interfaces in order to minimize your efforts for testing BytePS. -If your tasks only rely on Horovod's allreduce and broadcast, you should be able to switch to BytePS in 1 minute. Simply replace `import horovod.tensorflow as hvd` by `import byteps.tensorflow as bps`, and then replace all `hvd` in your code by `bps`. +If your tasks only rely on Horovod's allreduce and broadcast, you should be able to switch to BytePS in 1 minute. 
Simply replace `import horovod.tensorflow as hvd` by `import byteps.tensorflow as bps`, and then replace all `hvd` in your code by `bps`. If your code invokes `hvd.allreduce` directly, you should also replace it by `bps.push_pull`. Many of our examples were copied from Horovod and modified in this way. For instance, compare the MNIST example for [BytePS](https://github.com/bytedance/byteps/blob/master/example/tensorflow/tensorflow_mnist.py) and [Horovod](https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py). diff --git a/byteps/mxnet/tensor_util.cc b/byteps/mxnet/tensor_util.cc index 852fb7bbe..5d760d537 100644 --- a/byteps/mxnet/tensor_util.cc +++ b/byteps/mxnet/tensor_util.cc @@ -134,15 +134,6 @@ NDArray* TensorUtil::New(int device, int dtype) { void TensorUtil::Free(NDArray* tensor) { delete tensor; } -// Resize tensor to nDimension with length size[i] in dimension i -void TensorUtil::ResizeNd(NDArray* tensor, int nDimension, int64_t* size) { - TShape mx_shape(nDimension); - for (int idx = 0; idx < nDimension; ++idx) { - mx_shape[idx] = size[idx]; - } - tensor->Reshape(mx_shape); -} - // Copy from tensor to output void TensorUtil::Copy(NDArray* output, NDArray* tensor) { if (tensor->shape() != output->shape()) diff --git a/byteps/mxnet/tensor_util.h b/byteps/mxnet/tensor_util.h index c64704444..03401d32a 100644 --- a/byteps/mxnet/tensor_util.h +++ b/byteps/mxnet/tensor_util.h @@ -41,7 +41,6 @@ class TensorUtil { static NDArray* New(int device, int dtype); static void Free(NDArray* tensor); - static void ResizeNd(NDArray* tensor, int nDimension, int64_t* size); static void Copy(NDArray* output, NDArray* tensor); static void DivideTensorInPlace(NDArray* tensor, int value); diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server index 52689b673..79d8e8d9c 100644 --- a/docker/Dockerfile.server +++ b/docker/Dockerfile.server @@ -22,16 +22,16 @@ RUN rm -f /tmp/pip.conf &&\ RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH -ENV PIP_SOURCE_PROXY https://mirrors.aliyun.com/pypi/simple/ +ENV LD_LIBRARY_PATH /root/incubator-mxnet/lib/:/usr/local/lib:$LD_LIBRARY_PATH ENV SERVER_BUILD_OPTS "USE_BLAS=openblas USE_MKL=1 USE_DIST_KVSTORE=1" ENV BYTEPS_SERVER_MXNET_PATH /root/incubator-mxnet ENV MXNET_SERVER_LINK https://github.com/bytedance/incubator-mxnet +ENV BYTEPS_BASE_PATH /usr/local +ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps +ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps + RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ build-essential \ ca-certificates \ @@ -72,4 +72,13 @@ WORKDIR /root/ RUN git clone --single-branch --branch byteps --recurse-submodules $MXNET_SERVER_LINK RUN cd $BYTEPS_SERVER_MXNET_PATH && \ - make clean_all && make -j16 $SERVER_BUILD_OPTS \ No newline at end of file + make clean_all && make -j16 $SERVER_BUILD_OPTS + +RUN cd $BYTEPS_SERVER_MXNET_PATH && \ + cd python && \ + python setup.py build && \ + python setup.py install + +RUN cd $BYTEPS_BASE_PATH &&\ + git clone --recurse-submodules $BYTEPS_GIT_LINK + diff --git a/docker/Dockerfile.worker.mxnet b/docker/Dockerfile.worker.mxnet 
new file mode 100644 index 000000000..e01dd2c16 --- /dev/null +++ b/docker/Dockerfile.worker.mxnet @@ -0,0 +1,137 @@ +# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +FROM nvidia/cuda:9.0-devel-ubuntu16.04 +ARG REGION + +RUN rm -f /tmp/pip.conf &&\ + echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf + +RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi + +ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 +ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH + +ENV BYTEPS_BASE_PATH /usr/local +ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps +ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps + +ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0 + +RUN apt-get update &&\ + apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + ca-certificates \ + git \ + curl \ + wget \ + vim \ + libopenblas-dev \ + liblapack-dev \ + libopencv-dev \ + python \ + python-dev \ + python-setuptools \ + libjemalloc-dev \ + graphviz \ + cmake \ + libjpeg-dev \ + libpng-dev \ + iftop \ + lsb-release \ + libcudnn7=${CUDNN_VERSION} \ + libnuma-dev \ + gcc-4.9 \ + g++-4.9 \ + gcc-4.9-base + +RUN apt-get update &&\ + apt-get -y install python-pip &&\ + pip install --upgrade pip + +RUN pip --no-cache-dir install \ + matplotlib \ + numpy==1.15.2 \ + scipy \ + sklearn \ + pandas \ + graphviz==0.9.0 \ + mxboard \ + tensorboard==1.0.0a6 + +# Install NCCL +ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 + +RUN cd / && \ + wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ + cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ + mkdir -p /usr/local/nccl && \ + tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ + echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig && rm -rf /nccl-$NCCL_VERSION + + +WORKDIR /root/ + +RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig + +RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ + ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ + ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ + ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 + + +################################ install your framework 
################################ +# install mxnet +ENV MXNET_VERSION 1.4.1 +RUN pip --no-cache-dir install mxnet-cu90==$MXNET_VERSION + +################################ install your framework ################################ + + +RUN cd $BYTEPS_BASE_PATH &&\ + git clone --recurse-submodules $BYTEPS_GIT_LINK + +# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. +RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 + + +# Install BytePS +RUN cd $BYTEPS_PATH &&\ + BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install + +# Remove GCC pinning +RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove g++ /usr/bin/g++-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 + +RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ + rm -rf /usr/local/cuda/lib64/libcuda.so.1 + + diff --git a/docker/Dockerfile.worker.all_in_one b/docker/Dockerfile.worker.pytorch similarity index 59% rename from docker/Dockerfile.worker.all_in_one rename to docker/Dockerfile.worker.pytorch index ee7fed2a0..1aa8e5c8e 100644 --- a/docker/Dockerfile.worker.all_in_one +++ b/docker/Dockerfile.worker.pytorch @@ -13,8 +13,7 @@ # limitations under the License. 
# ============================================================================= -FROM nvidia/cuda:10.0-devel-ubuntu16.04 - +FROM nvidia/cuda:9.0-devel-ubuntu16.04 ARG REGION RUN rm -f /tmp/pip.conf &&\ @@ -26,13 +25,15 @@ ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH -ENV PIP_SOURCE_PROXY https://mirrors.aliyun.com/pypi/simple/ ENV BYTEPS_BASE_PATH /usr/local ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps -RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ +ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0 + +RUN apt-get update &&\ + apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ build-essential \ ca-certificates \ git \ @@ -52,7 +53,11 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held- libpng-dev \ iftop \ lsb-release \ - libnuma-dev + libcudnn7=${CUDNN_VERSION} \ + libnuma-dev \ + gcc-4.9 \ + g++-4.9 \ + gcc-4.9-base RUN apt-get update &&\ apt-get -y install python-pip &&\ @@ -68,6 +73,18 @@ RUN pip --no-cache-dir install \ mxboard \ tensorboard==1.0.0a6 +# Install NCCL +ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 + +RUN cd / && \ + wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ + cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ + mkdir -p /usr/local/nccl && \ + tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ + echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig && rm -rf /nccl-$NCCL_VERSION + + WORKDIR /root/ RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ @@ -83,15 +100,9 @@ RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ ################################ install your framework ################################ -# We install three frameworks in this example, but you can remove two of them - -# install mxnet -ENV MXNET_VERSION 1.4.1 -RUN pip --no-cache-dir install -i $PIP_SOURCE_PROXY mxnet-cu100==$MXNET_VERSION - # install pytorch ENV TORCH_VERSION 1.0.1 -RUN pip --no-cache-dir install -i $PIP_SOURCE_PROXY \ + RUN pip --no-cache-dir install \ future \ numpy \ pyyaml \ @@ -102,23 +113,34 @@ RUN pip --no-cache-dir install -i $PIP_SOURCE_PROXY \ torchvision==0.2.2 \ torch==$TORCH_VERSION -# install tensorflow -ENV TENSORFLOW_VERSION 1.4.1 -RUN pip --no-cache-dir install -i $PIP_SOURCE_PROXY tensorflow-gpu==$TENSORFLOW_VERSION && \ - rm -rf /tmp/pip && \ - rm -rf /root/.cache - +################################ install your framework ################################ -################################ install BytePS ################################ RUN cd $BYTEPS_BASE_PATH &&\ git clone --recurse-submodules $BYTEPS_GIT_LINK -# Remember to set BYTEPS_WITHOUT_PYTORCH=1 or BYTEPS_WITHOUT_TENSORFLOW=1 when you don't have them, -# But you don't need to set BYTEPS_WITHOUT_MXNET even if your framework isn't MXNet +# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
+RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 + + +# Install BytePS RUN cd $BYTEPS_PATH &&\ - python setup.py install + BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install + +# Remove GCC pinning +RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove g++ /usr/bin/g++-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ rm -rf /usr/local/cuda/lib64/libcuda.so.1 + diff --git a/docker/Dockerfile.worker.tensorflow b/docker/Dockerfile.worker.tensorflow new file mode 100644 index 000000000..010f36015 --- /dev/null +++ b/docker/Dockerfile.worker.tensorflow @@ -0,0 +1,139 @@ +# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +FROM nvidia/cuda:9.0-devel-ubuntu16.04 +ARG REGION + +RUN rm -f /tmp/pip.conf &&\ + echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf + +RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi + +ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 +ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH + +ENV BYTEPS_BASE_PATH /usr/local +ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps +ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps + +ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0 + +RUN apt-get update &&\ + apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + ca-certificates \ + git \ + curl \ + wget \ + vim \ + libopenblas-dev \ + liblapack-dev \ + libopencv-dev \ + python \ + python-dev \ + python-setuptools \ + libjemalloc-dev \ + graphviz \ + cmake \ + libjpeg-dev \ + libpng-dev \ + iftop \ + lsb-release \ + libcudnn7=${CUDNN_VERSION} \ + libnuma-dev \ + gcc-4.9 \ + g++-4.9 \ + gcc-4.9-base + +RUN apt-get update &&\ + apt-get -y install python-pip &&\ + pip install --upgrade pip + +RUN pip --no-cache-dir install \ + matplotlib \ + numpy==1.15.2 \ + scipy \ + sklearn \ + pandas \ + graphviz==0.9.0 \ + mxboard \ + tensorboard==1.0.0a6 + +# Install NCCL +ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 + +RUN cd / && \ + wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ + cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ + mkdir -p /usr/local/nccl && \ + tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ + echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig && rm -rf /nccl-$NCCL_VERSION + + +WORKDIR /root/ + +RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig + +RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ + ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ + ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ + ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 + + +################################ install your framework ################################ +# install tensorflow +ENV TENSORFLOW_VERSION 1.12.0 +RUN pip --no-cache-dir install tensorflow-gpu==$TENSORFLOW_VERSION && \ + rm -rf /tmp/pip && \ + rm -rf /root/.cache + +################################ install your framework ################################ + + +RUN cd $BYTEPS_BASE_PATH &&\ + git clone --recurse-submodules $BYTEPS_GIT_LINK + +# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
+RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ + update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ + update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 + + +# Install BytePS +RUN cd $BYTEPS_PATH &&\ + BYTEPS_WITHOUT_PYTORCH=1 python setup.py install + +# Remove GCC pinning +RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ + update-alternatives --remove g++ /usr/bin/g++-4.9 && \ + update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 + +RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ + rm -rf /usr/local/cuda/lib64/libcuda.so.1 + + diff --git a/docs/env.md b/docs/env.md index e0bd273fc..c86c64435 100644 --- a/docs/env.md +++ b/docs/env.md @@ -1,5 +1,9 @@ # BytePS Environment Variables +Regardless of your framework (TensorFlow, PyTorch, or MXNet), you must set the required environment variables below, including the DMLC_* variables. This is because we leverage the [DMLC/MXNet bootstrapping process](https://mxnet.incubator.apache.org/versions/master/faq/distributed_training.html#manually-launching-jobs). + +To run distributed training, you must start one scheduler, at least one server, and at least two workers. If you only have one worker, you won't need a scheduler or a server. + ## Required for workers For each worker machine, you must specify the following variables: @@ -12,9 +16,11 @@ export DMLC_WORKER_ID=x export DMLC_NUM_WORKER=y ``` -If you have ```NVIDIA_VISIBLE_DEVICES``` set, you can run ```launcher/launcher.py YOUR_COMMAND``` to start your job. +`DMLC_PS_ROOT_URI` is the IP of your scheduler. `DMLC_PS_ROOT_PORT` is the port that your scheduler binds to. + +If you have `NVIDIA_VISIBLE_DEVICES` set, you can run `launcher/launcher.py YOUR_COMMAND` to start your job. -Alternatively, you can start the job on each GPU after specifying: +Alternatively, if you don't use `launcher/launcher.py`, you can start the job on each GPU after specifying: ``` export BYTEPS_LOCAL_RANK=r @@ -36,7 +42,7 @@ https://mxnet.incubator.apache.org/versions/master/faq/distributed_training.html In short, you should configure the same DMLC_* variables as the worker, except that DMLC_ROLE should be either server or scheduler. -Also, set DMLC_ENABLE_RDMA accordingly as workers. +Also, set DMLC_ENABLE_RDMA if you have an RDMA network. This must be consistent with the workers. ## BytePS debug diff --git a/docs/rationale.md b/docs/rationale.md index bf213820a..fd447b184 100644 --- a/docs/rationale.md +++ b/docs/rationale.md @@ -1,10 +1,10 @@ # The Rationale of BytePS -We find that some users may not fully understand this page. If you have doubts after reading this page, we prepare a detailed [FAQ](/docs/faq.md). If you still have questions, you are welcome to Github raise issues. +We find that some users may not fully understand this page. If you have doubts after reading this page, we have prepared a detailed [FAQ](/docs/faq.md). 
If you still have questions, you are welcome to raise GitHub issues. ## Background -You want to run your training on your expensive GPU cluster. Unfortunately, when you run distributed training, these GPUs are not well fed -- their precious cycles are wasted on waiting for network transmission. You tried many existing solutions, most notably the popular allreduce approach. You probably found NCCL gave you better performance than many other alternatives. You know NCCL is developed by NVIDIA and HPC experts. You guess that may be it. +You want to run your training in your expensive GPU cluster. Unfortunately, when you run distributed training, these GPUs are not well fed -- their precious cycles are wasted on waiting for network transmission. You tried many existing solutions, most notably the popular allreduce approach. You probably found NCCL gave you better performance than many other alternatives. You know NCCL is developed by NVIDIA and HPC experts. You guess that may be it. We understand you. This is why we develop BytePS and want to show you, in a cloud or in-house shared cluster environment, NCCL (and fundamentally allreduce) is suboptimal. @@ -35,7 +35,7 @@ In light of this, BytePS is specifically designed to run PS instances using only GPUs are extremely expensive compared with CPUs and network bandwidth. Use AWS public price sheet as an example. If you rent 4x [p3.16xlarge](https://aws.amazon.com/ec2/instance-types/p3/), it would cost you nearly $100 per hour. However, to match the network bandwidth, you can rent 4x or even 8x [c5n.xlarge](https://aws.amazon.com/ec2/pricing/on-demand/) as your PS for $0.2 per instance per hour! c5n.xlarge has 4 CPU cores that are sufficient for BytePS and its [up to 25Gbps](https://aws.amazon.com/ec2/instance-types/) network. -Therefore, on a public cloud, you just need 2% more spending, you may get up to 100% improvement because your bottleneck bandwidth it utilized twice as efficient. If you manage your own training cluster, you may not really have any additional spending, because you probably have spare CPU and networking resources somewhere in your data center. +Therefore, on a public cloud, with just 2% more spending you may get up to 100% improvement, because your bottleneck bandwidth is utilized twice as efficiently. If you manage your own training cluster, you may not really have any additional spending, because you probably have spare CPU and networking resources somewhere in your data center. ## I still don't understand. NCCL beats TF (or MXNet) PS so hard.. diff --git a/docs/running.md b/docs/running.md index 9cba7ea98..80d07bcac 100644 --- a/docs/running.md +++ b/docs/running.md @@ -1,6 +1,6 @@ # Running BytePS -BytePS follows the same running model as MXNet's PS implemenation, and provides a script, launcher/launcher.py, to help you start individual processes. +BytePS follows the same running model as MXNet's PS implementation, and provides a script, launcher/launcher.py, to help you start individual processes. **The instructions below, including the DMLC variables, apply to all frameworks.** Let's say you have two worker machines (or docker containers) that have GPUs, one machine or container as a server, and a scheduler. The scheduler binds on 10.0.0.1 and port 9000. The workers and the server can connect to the scheduler via the IP and port using TCP. 
@@ -22,18 +22,26 @@ DMLC_WORKER_ID=1 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \ python launcher/launcher.py YOUR_COMMAND ``` -On the server, run (remove DMLC_WORKER_ID, and set role to server): +**For servers and schedulers, we highly recommend you use the docker image we provide:** + +``` +docker pull bytepsimage/byteps_server +``` + +Start the server and scheduler docker instances with this image. On the server, run the following. Compared with the worker command, we remove DMLC_WORKER_ID and set the role to server. ``` DMLC_ROLE=server DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 python launcher/launcher.py ``` -On the scheduler, run (remove DMLC_WORKER_ID, and set role to scheduler): +On the scheduler, run (we also remove DMLC_WORKER_ID and set the role to scheduler): ``` DMLC_ROLE=scheduler DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 python launcher/launcher.py ``` -The order of above commands does not matter. +In this example, your scheduler must be able to bind to `10.0.0.1:9000`. + +The order of starting the workers, servers, and scheduler does not matter. diff --git a/docs/step-by-step-tutorials.md b/docs/step-by-step-tutorials.md new file mode 100644 index 000000000..d9e3da664 --- /dev/null +++ b/docs/step-by-step-tutorials.md @@ -0,0 +1,179 @@ +# A Step-by-Step Tutorial + +The goal of this tutorial is to help you run BytePS quickly. To avoid problems with your system environment, we recommend that you use our provided docker images, at least as a first step. + + +## Single Machine Training + +### TensorFlow +``` +docker pull bytepsimage/worker_tensorflow + +nvidia-docker run --shm-size=32768m -it bytepsimage/worker_tensorflow bash + +# now you are in docker environment +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export DMLC_WORKER_ID=0 # your worker id +export DMLC_NUM_WORKER=1 # you only have one worker +export DMLC_ROLE=worker # your role is worker + +# the following values do not matter for non-distributed jobs +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 +export DMLC_PS_ROOT_PORT=1234 + +# can also try: export EVAL_TYPE=mnist +export EVAL_TYPE=benchmark +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \ + --model ResNet50 --num-iters 1000 +``` + +### PyTorch + + +``` +docker pull bytepsimage/worker_pytorch + +nvidia-docker run --shm-size=32768m -it bytepsimage/worker_pytorch bash + +# now you are in docker environment +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export DMLC_WORKER_ID=0 # your worker id +export DMLC_NUM_WORKER=1 # you only have one worker +export DMLC_ROLE=worker # your role is worker + +# the following values do not matter for non-distributed jobs +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 +export DMLC_PS_ROOT_PORT=1234 + +export EVAL_TYPE=benchmark +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \ + --model resnet50 --num-iters 1000 +``` + +### MXNet + +``` +docker pull bytepsimage/worker_mxnet + +nvidia-docker run --shm-size=32768m -it bytepsimage/worker_mxnet bash + +# now you are in docker environment +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export DMLC_WORKER_ID=0 # your worker id +export DMLC_NUM_WORKER=1 # you only have one worker +export DMLC_ROLE=worker # your role is worker + +# the following values do not matter for non-distributed jobs +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 
+export DMLC_PS_ROOT_PORT=1234 + +export EVAL_TYPE=benchmark +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \ + --benchmark 1 --batch-size=32 +``` + +## Distributed Training + +Let's say you have two workers, each with 4 GPUs. For simplicity, we use one server. + +The way to launch the scheduler and the server is the same for any framework. + +For the scheduler: +``` +# scheduler can use the same image as servers +docker pull bytepsimage/byteps_server + +docker run -it bytepsimage/byteps_server bash + +# now you are in docker environment +export DMLC_NUM_WORKER=2 +export DMLC_ROLE=scheduler +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP +export DMLC_PS_ROOT_PORT=1234 # the scheduler port + +python /usr/local/byteps/launcher/launch.py +``` + +For the server: +``` +docker pull bytepsimage/byteps_server + +docker run -it bytepsimage/byteps_server bash + +# now you are in docker environment +export DMLC_NUM_WORKER=2 +export DMLC_ROLE=server +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP +export DMLC_PS_ROOT_PORT=1234 # the scheduler port + +python /usr/local/byteps/launcher/launch.py +``` + +For the workers, you need to pay attention to `DMLC_WORKER_ID`. This is the main difference compared to single-machine jobs. Let's say the two workers are using MXNet. + +For worker-0: +``` +docker pull bytepsimage/worker_mxnet + +nvidia-docker run --shm-size=32768m -it bytepsimage/worker_mxnet bash + +# now you are in docker environment +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export DMLC_WORKER_ID=0 # worker-0 +export DMLC_NUM_WORKER=2 # 2 workers +export DMLC_ROLE=worker # your role is worker +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP +export DMLC_PS_ROOT_PORT=1234 # the scheduler port + +export EVAL_TYPE=benchmark +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \ + --benchmark 1 --batch-size=32 +``` + +For worker-1: + +``` +docker pull bytepsimage/worker_mxnet + +nvidia-docker run --shm-size=32768m -it bytepsimage/worker_mxnet bash + +# now you are in docker environment +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export DMLC_WORKER_ID=1 # worker-1 +export DMLC_NUM_WORKER=2 # 2 workers +export DMLC_ROLE=worker # your role is worker +export DMLC_NUM_SERVER=1 +export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP +export DMLC_PS_ROOT_PORT=1234 # the scheduler port + +export EVAL_TYPE=benchmark +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \ + --benchmark 1 --batch-size=32 +``` + +If your workers use TensorFlow, you need to change the image name to `bytepsimage/worker_tensorflow`, and replace the python script with +``` +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \ + --model ResNet50 --num-iters 1000 +``` + +If your workers use PyTorch, you need to change the image name to `bytepsimage/worker_pytorch`, and replace the python script with + +``` +python /usr/local/byteps/launcher/launch.py \ + /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \ + --model resnet50 --num-iters 1000 +``` + 
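A note on porting your own code: the tutorial above launches the shell scripts shipped under `example/`, but the same launcher works for any training script ported from Horovod with the README's rename rule (`hvd` → `bps`, `hvd.allreduce` → `bps.push_pull`). The sketch below illustrates such a port for TensorFlow; apart from `push_pull`, the `bps.*` names used here (`init`, `local_rank`, `size`, `DistributedOptimizer`, `BroadcastGlobalVariablesHook`) are assumed to mirror Horovod's API, so treat `example/tensorflow` as the authoritative reference.

```python
# Minimal sketch of a Horovod-to-BytePS port, following the README's rename rule.
# The bps.* names below are assumed to mirror Horovod's TensorFlow API.
import tensorflow as tf
import byteps.tensorflow as bps   # was: import horovod.tensorflow as hvd

bps.init()                        # was: hvd.init()

# Pin each process to a single GPU, as in the usual Horovod setup.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(bps.local_rank())

# A toy model so the sketch is self-contained.
x = tf.random_normal([32, 10])
loss = tf.reduce_mean(tf.square(tf.layers.dense(x, 1)))

# Scale the learning rate by the number of workers, then wrap the optimizer.
opt = tf.train.MomentumOptimizer(0.01 * bps.size(), 0.9)
opt = bps.DistributedOptimizer(opt)   # was: hvd.DistributedOptimizer(opt)
global_step = tf.train.get_or_create_global_step()
train_op = opt.minimize(loss, global_step=global_step)

# Broadcast initial variables from rank 0 so all workers start consistently.
hooks = [bps.BroadcastGlobalVariablesHook(0),
         tf.train.StopAtStepHook(last_step=1000)]
with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
    while not sess.should_stop():
        sess.run(train_op)
```

Such a script is started through `launcher/launch.py` in the same way as the tutorial scripts above.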
diff --git a/launcher/launch.py b/launcher/launch.py index 2ea4d72a3..8c909c7c5 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -1,5 +1,6 @@ #!/usr/bin/python +from __future__ import print_function import os import subprocess import threading @@ -18,7 +19,7 @@ def worker(local_rank, local_size, command): subprocess.check_call(command, env=my_env, stdout=sys.stdout, stderr=sys.stderr, shell=True) if __name__ == "__main__": - print "BytePS launching " + os.environ["DMLC_ROLE"] + print("BytePS launching " + os.environ["DMLC_ROLE"]) sys.stdout.flush() if os.environ["DMLC_ROLE"] == "worker": @@ -38,11 +39,8 @@ def worker(local_rank, local_size, command): else: if "BYTEPS_SERVER_MXNET_PATH" not in os.environ: - print "BYTEPS_SERVER_MXNET_PATH env not set" + print("BYTEPS_SERVER_MXNET_PATH env not set") os._exit(0) sys.path.insert(0, os.getenv("BYTEPS_SERVER_MXNET_PATH")+"/python") import mxnet - print "BytePS Server MXNet version: " + mxnet.__version__ - # TODO: terminates when workers quit - while True: - time.sleep(3600) +
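A note on the launcher: `docs/env.md` above explains that, if you bypass `launcher/launch.py`, each GPU process must have `BYTEPS_LOCAL_RANK` exported manually. The sketch below shows the kind of per-GPU spawning the launcher performs; only the `worker(local_rank, local_size, command)` signature and `BYTEPS_LOCAL_RANK` are confirmed by this diff, while the remaining details (`BYTEPS_LOCAL_SIZE`, deriving the GPU count from `NVIDIA_VISIBLE_DEVICES`) are assumptions for illustration.

```python
# Simplified, illustrative per-GPU launcher in the spirit of launcher/launch.py.
# Only worker()'s signature and BYTEPS_LOCAL_RANK are confirmed by the diff above;
# BYTEPS_LOCAL_SIZE and the GPU-count logic are assumptions.
from __future__ import print_function
import os
import subprocess
import sys
import threading


def worker(local_rank, local_size, command):
    # Give each spawned process its local rank via the environment.
    my_env = os.environ.copy()
    my_env["BYTEPS_LOCAL_RANK"] = str(local_rank)   # documented in docs/env.md
    my_env["BYTEPS_LOCAL_SIZE"] = str(local_size)   # assumed variable name
    subprocess.check_call(command, env=my_env,
                          stdout=sys.stdout, stderr=sys.stderr, shell=True)


if __name__ == "__main__":
    print("Launching " + os.environ.get("DMLC_ROLE", "worker"))
    # One process per visible GPU.
    local_size = len(os.environ.get("NVIDIA_VISIBLE_DEVICES", "0").split(","))
    command = " ".join(sys.argv[1:])
    threads = []
    for rank in range(local_size):
        t = threading.Thread(target=worker, args=(rank, local_size, command))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
```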