From 9d5605aa016d88e15e20eb021dc7f1add308987b Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Fri, 28 Feb 2020 21:42:23 +0800 Subject: [PATCH 01/11] setup: handle cxx flag --- 3rdparty/ps-lite | 2 +- setup.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite index 11ba01b17..5da3f6859 160000 --- a/3rdparty/ps-lite +++ b/3rdparty/ps-lite @@ -1 +1 @@ -Subproject commit 11ba01b173f5c034e086b098f488e73ce757eef6 +Subproject commit 5da3f685941977d1a7f24ba2ea6a201c4c68ccc4 diff --git a/setup.py b/setup.py index 9d3c835eb..b165d7300 100644 --- a/setup.py +++ b/setup.py @@ -790,7 +790,43 @@ def build_extensions(self): if os.environ.get('CI', 'false') == 'false': make_option += "-j " if has_rdma_header(): - make_option += "USE_RDMA=1 " + make_option += "USE_RDMA=1 " + + # To resolve tf-gcc incompatibility + has_cxx_flag = False + glibcxx_flag = False + if not int(os.environ.get('BYTEPS_WITHOUT_TENSORFLOW', 0)): + try: + import tensorflow as tf + make_option += 'ADD_CFLAGS="' + for flag in tf.sysconfig.get_compile_flags(): + if 'D_GLIBCXX_USE_CXX11_ABI' in flag: + has_cxx_flag = True + glibcxx_flag = False if (flag[-1]=='0') else True + make_option += flag + ' ' + make_option += '"' + except: + pass + + # To resolve torch-gcc incompatibility + if not int(os.environ.get('BYTEPS_WITHOUT_PYTORCH', 0)): + try: + import torch + torch_flag = torch.compiled_with_cxx11_abi() + if has_cxx_flag: + if glibcxx_flag != torch_flag: + raise DistutilsError( + '-D_GLIBCXX_USE_CXX11_ABI is not consistent between TensorFlow and PyTorch, ' + 'consider install them separately.') + else: + pass + else: + make_option += 'ADD_CFLAGS=-D_GLIBCXX_USE_CXX11_ABI=' + \ + str(int(torch_flag)) + ' ' + has_cxx_flag = True + glibcxx_flag = torch_flag + except: + pass make_process = subprocess.Popen('make ' + make_option, cwd='3rdparty/ps-lite', @@ -804,8 +840,10 @@ def build_extensions(self): 'Exit code: {0}'.format(make_process.returncode)) options = get_common_options(self) - built_plugins = [] + if has_cxx_flag: + options['COMPILE_FLAGS'] += ['-D_GLIBCXX_USE_CXX11_ABI=' + str(int(glibcxx_flag))] + built_plugins = [] try: build_server(self, options) except: From 95ce7bd826a518fe7ce0ee68b2a2e576cdffca3b Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 09:42:20 +0800 Subject: [PATCH 02/11] automatically add cuda lib for mxnet --- setup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/setup.py b/setup.py index b165d7300..b712d6b70 100644 --- a/setup.py +++ b/setup.py @@ -624,6 +624,12 @@ def get_nccl_vals(): def build_mx_extension(build_ext, options): # clear ROLE -- installation does not need this os.environ.pop("DMLC_ROLE", None) + + # fix "libcuda.so.1 not found" issue + cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/lib/cuda/') + ln_command = "ln -sf " + cuda_home + "lib64/stubs/libcuda.so /usr/lib/libcuda.so.1" + os.system(ln_command) + check_mx_version() mx_compile_flags, mx_link_flags = get_mx_flags( build_ext, options['COMPILE_FLAGS']) @@ -669,6 +675,8 @@ def build_mx_extension(build_ext, options): build_ext.build_extension(mxnet_lib) + os.system("rm -rf /usr/lib/libcuda.so.1") + def dummy_import_torch(): try: From eb0bc48bd31b3660d9de716dd85625c941740c59 Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 09:56:13 +0800 Subject: [PATCH 03/11] fix softlink location --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index b712d6b70..c43a69605 100644 --- a/setup.py +++ b/setup.py @@ -626,8 +626,9 @@ def build_mx_extension(build_ext, options): os.environ.pop("DMLC_ROLE", None) # fix "libcuda.so.1 not found" issue - cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/lib/cuda/') - ln_command = "ln -sf " + cuda_home + "lib64/stubs/libcuda.so /usr/lib/libcuda.so.1" + cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/lib/cuda') + cuda_stub_path = cuda_home + '/lib64/stubs' + ln_command = "cd " + cuda_stub_path + "; ln -sf libcuda.so libcuda.so.1" os.system(ln_command) check_mx_version() @@ -675,7 +676,7 @@ def build_mx_extension(build_ext, options): build_ext.build_extension(mxnet_lib) - os.system("rm -rf /usr/lib/libcuda.so.1") + os.system("rm -rf " + cuda_stub_path + "/libcuda.so.1") def dummy_import_torch(): From 16df4beb1a6e2c67f8de92d29e4995d67861a3fd Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 10:03:40 +0800 Subject: [PATCH 04/11] fix os path --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c43a69605..82137ea9e 100644 --- a/setup.py +++ b/setup.py @@ -629,6 +629,7 @@ def build_mx_extension(build_ext, options): cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/lib/cuda') cuda_stub_path = cuda_home + '/lib64/stubs' ln_command = "cd " + cuda_stub_path + "; ln -sf libcuda.so libcuda.so.1" + os.environ["PATH"] += os.pathsep + cuda_stub_path os.system(ln_command) check_mx_version() From d6b88c00835469aee5fc502c3bcb90f7eb3ddbd1 Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 10:08:32 +0800 Subject: [PATCH 05/11] fix ld_library_path --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 82137ea9e..e23f55bbc 100644 --- a/setup.py +++ b/setup.py @@ -629,7 +629,7 @@ def build_mx_extension(build_ext, options): cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/lib/cuda') cuda_stub_path = cuda_home + '/lib64/stubs' ln_command = "cd " + cuda_stub_path + "; ln -sf libcuda.so libcuda.so.1" - os.environ["PATH"] += os.pathsep + cuda_stub_path + os.environ["LD_LIBRARY_PATH"] += ":" + cuda_stub_path os.system(ln_command) check_mx_version() From 86e41981190912e2ce61cd1aabb6d722d6fd5b39 Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 11:00:15 +0800 Subject: [PATCH 06/11] fix cuda default path --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e23f55bbc..fac6ee380 100644 --- a/setup.py +++ b/setup.py @@ -626,10 +626,9 @@ def build_mx_extension(build_ext, options): os.environ.pop("DMLC_ROLE", None) # fix "libcuda.so.1 not found" issue - cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/lib/cuda') + cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/local/cuda') cuda_stub_path = cuda_home + '/lib64/stubs' ln_command = "cd " + cuda_stub_path + "; ln -sf libcuda.so libcuda.so.1" - os.environ["LD_LIBRARY_PATH"] += ":" + cuda_stub_path os.system(ln_command) check_mx_version() From 735a60840e5de7802a83e8ae9a6a121106cc67b9 Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 11:14:35 +0800 Subject: [PATCH 07/11] refactor dockerfilie --- docker/Dockerfile | 55 ++++++++++++++++ docker/Dockerfile.mxnet | 116 --------------------------------- docker/Dockerfile.pytorch | 120 ----------------------------------- docker/Dockerfile.tensorflow | 119 ---------------------------------- docker/README.md | 14 ++-- 5 files changed, 62 insertions(+), 362 deletions(-) create mode 100644 docker/Dockerfile delete mode 100644 docker/Dockerfile.mxnet delete mode 100644 docker/Dockerfile.pytorch delete mode 100644 docker/Dockerfile.tensorflow diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 000000000..0acbe74b2 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,55 @@ +FROM nvidia/cuda:10.0-devel-ubuntu18.04 + +ARG https_proxy +ARG http_proxy + +ARG BYTEPS_BASE_PATH=/usr/local +ARG BYTEPS_PATH=$BYTEPS_BASE_PATH/byteps +ARG BYTEPS_GIT_LINK=https://github.com/bytedance/byteps +ARG BYTEPS_BRANCH=master + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update +RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + build-essential \ + tzdata \ + ca-certificates \ + git \ + curl \ + wget \ + vim \ + cmake \ + lsb-release \ + libcudnn7=7.6.0.64-1+cuda10.0 \ + libnuma-dev \ + ibverbs-providers \ + librdmacm-dev \ + ibverbs-utils \ + rdmacm-utils \ + libibverbs-dev \ + python3 \ + python3-dev \ + python3-pip \ + python3-setuptools \ + libnccl2=2.4.8-1+cuda10.1 \ + libnccl-dev=2.4.8-1+cuda10.1 + +# install framework +ARG FRAMEWORK=tensorflow +RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ + pip3 install -U tensorflow-gpu==1.14.0; \ + elif [ "$FRAMEWORK" = "pytorch" ]; then \ + pip3 install -U numpy torch==1.4.0; \ + elif [ "$FRAMEWORK" = "mxnet" ]; then \ + pip3 install -U mxnet-cu100==1.5.0; \ + else \ + echo "unknown framework: $FRAMEWORK"; \ + exit 1; \ + fi + +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH + +RUN cd $BYTEPS_BASE_PATH &&\ + git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK &&\ + cd $BYTEPS_PATH &&\ + python3 setup.py install diff --git a/docker/Dockerfile.mxnet b/docker/Dockerfile.mxnet deleted file mode 100644 index 9854fcb7a..000000000 --- a/docker/Dockerfile.mxnet +++ /dev/null @@ -1,116 +0,0 @@ -FROM nvidia/cuda:10.0-devel-ubuntu18.04 - -ARG https_proxy -ARG http_proxy - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG DEBIAN_FRONTEND=noninteractive -RUN apt-get update -qq -RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - tzdata \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - cmake \ - lsb-release \ - libcudnn7=7.6.0.64-1+cuda10.0 \ - libnuma-dev \ - ibverbs-providers \ - librdmacm-dev \ - ibverbs-utils \ - rdmacm-utils \ - libibverbs-dev \ - python3 \ - python3-dev \ - python3-pip \ - python3-setuptools - -# Install NCCL -ENV NCCL_VERSION=7c72dee660e4d055b81721dd6b03e4e1c0a983cf -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - -WORKDIR /root/ - -# install gcc 4.9 -RUN mkdir -p /root/gcc/ && cd /root/gcc &&\ - wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728424/libasan1_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728426/libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728314/gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728399/cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728404/gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728432/libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728401/g++-4.9_4.9.3-13ubuntu2_amd64.deb - -RUN cd /root/gcc &&\ - dpkg -i gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libmpfr4_3.1.4-1_amd64.deb &&\ - dpkg -i libasan1_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i g++-4.9_4.9.3-13ubuntu2_amd64.deb - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. -RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - -# install mxnet -ARG FRAMEWORK_VERSION=1.5.0 -RUN python3 -m pip --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -ARG BYTEPS_USE_RDMA=1 -ARG BYTEPS_WITHOUT_PYTORCH=1 -ARG BYTEPS_WITHOUT_TENSORFLOW=1 -ARG BYTEPS_BRANCH=master -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK -RUN cd $BYTEPS_PATH &&\ - python3 setup.py install - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 diff --git a/docker/Dockerfile.pytorch b/docker/Dockerfile.pytorch deleted file mode 100644 index d7fc0c9ad..000000000 --- a/docker/Dockerfile.pytorch +++ /dev/null @@ -1,120 +0,0 @@ -FROM nvidia/cuda:10.0-devel-ubuntu18.04 - -ARG https_proxy -ARG http_proxy - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG DEBIAN_FRONTEND=noninteractive -RUN apt-get update -qq -RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - tzdata \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - cmake \ - lsb-release \ - libcudnn7=7.6.0.64-1+cuda10.0 \ - libnuma-dev \ - ibverbs-providers \ - librdmacm-dev \ - ibverbs-utils \ - rdmacm-utils \ - libibverbs-dev \ - python3 \ - python3-dev \ - python3-pip \ - python3-setuptools - -# Install NCCL -ENV NCCL_VERSION=7c72dee660e4d055b81721dd6b03e4e1c0a983cf -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - -WORKDIR /root/ - -# install gcc 4.9 -RUN mkdir -p /root/gcc/ && cd /root/gcc &&\ - wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728424/libasan1_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728426/libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728314/gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728399/cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728404/gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728432/libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728401/g++-4.9_4.9.3-13ubuntu2_amd64.deb - -RUN cd /root/gcc &&\ - dpkg -i gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libmpfr4_3.1.4-1_amd64.deb &&\ - dpkg -i libasan1_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i g++-4.9_4.9.3-13ubuntu2_amd64.deb - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. -RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - -# install pytorch -ARG FRAMEWORK_VERSION=1.1.0 -ARG TORCHVISION_VERSION=0.2.2 -ARG PILLOW_VERSION=6.1 -RUN python3 -m pip --no-cache-dir install torch==$FRAMEWORK_VERSION torchvision==$TORCHVISION_VERSION Pillow==$PILLOW_VERSION - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -ARG BYTEPS_USE_RDMA=1 -ARG BYTEPS_WITHOUT_TENSORFLOW=1 -ARG BYTEPS_WITHOUT_MXNET=1 -ARG BYTEPS_BRANCH=master -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK -RUN cd $BYTEPS_PATH &&\ - python3 setup.py install - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/Dockerfile.tensorflow b/docker/Dockerfile.tensorflow deleted file mode 100644 index 5873d649a..000000000 --- a/docker/Dockerfile.tensorflow +++ /dev/null @@ -1,119 +0,0 @@ -FROM nvidia/cuda:10.0-devel-ubuntu18.04 - -ARG https_proxy -ARG http_proxy - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG DEBIAN_FRONTEND=noninteractive -RUN apt-get update -qq -RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - tzdata \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - cmake \ - lsb-release \ - libcudnn7=7.6.0.64-1+cuda10.0 \ - libnuma-dev \ - ibverbs-providers \ - librdmacm-dev \ - ibverbs-utils \ - rdmacm-utils \ - libibverbs-dev \ - python3 \ - python3-dev \ - python3-pip \ - python3-setuptools - -# Install NCCL -ENV NCCL_VERSION=7c72dee660e4d055b81721dd6b03e4e1c0a983cf -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - -WORKDIR /root/ - -# install gcc 4.9 -RUN mkdir -p /root/gcc/ && cd /root/gcc &&\ - wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728424/libasan1_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728426/libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728314/gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728399/cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728404/gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728432/libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - wget http://launchpadlibrarian.net/253728401/g++-4.9_4.9.3-13ubuntu2_amd64.deb - -RUN cd /root/gcc &&\ - dpkg -i gcc-4.9-base_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libmpfr4_3.1.4-1_amd64.deb &&\ - dpkg -i libasan1_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libgcc-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i cpp-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i gcc-4.9_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i libstdc++-4.9-dev_4.9.3-13ubuntu2_amd64.deb &&\ - dpkg -i g++-4.9_4.9.3-13ubuntu2_amd64.deb - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. -RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - -# install tensorflow -ARG FRAMEWORK_VERSION=1.14.0 -RUN python3 -m pip --no-cache-dir install tensorflow-gpu==$FRAMEWORK_VERSION &&\ - rm -rf /tmp/pip && rm -rf /root/.cache - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -ARG BYTEPS_USE_RDMA=1 -ARG BYTEPS_WITHOUT_PYTORCH=1 -ARG BYTEPS_WITHOUT_MXNET=1 -ARG BYTEPS_BRANCH=master -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK -RUN cd $BYTEPS_PATH &&\ - python3 setup.py install - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/README.md b/docker/README.md index 16b8ddcaf..f8c85e7f4 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,10 +1,10 @@ # Prebuilt Images -Belows are prebuilt docker images, and their associated source dockerfiles. These prebuilt images might not be up-to-date. -You may need to manually build them to get the latest functionalities of BytePS, using the source dockerfiles. +Belows are prebuilt docker images, and their associated commands to build. These prebuilt images might not be up-to-date. +You may need to manually build them to get the latest functionalities of BytePS using the dockerfile. -| Docker Image Name | Source Dockerfile | Description | -| --- | --- | --- | -| bytepsimage/mxnet | Dockerfile.mxnet | Image for MXNet | -| bytepsimage/pytorch | Dockerfile.pytorch | Image for PyTorch | -| bytepsimage/tensorflow | Dockerfile.tensorflow | Image for TensorFlow | +| Docker image | How to build | +| --- | --- | +| bytepsimage/tensorflow | docker build -t bytepsimage/tensorflow . -f Dockerfile --build-arg FRAMEWORK=tensorflow | +| bytepsimage/pytorch | docker build -t bytepsimage/pytorch . -f Dockerfile --build-arg FRAMEWORK=pytorch | +| bytepsimage/mxnet | docker build -t bytepsimage/mxnet . -f Dockerfile --build-arg FRAMEWORK=mxnet | From f57bd8773b150495614c79ea8e8faefe3fd26098 Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sat, 29 Feb 2020 12:26:07 +0800 Subject: [PATCH 08/11] set proper nccl version --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0acbe74b2..6de7bc1d3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -31,8 +31,8 @@ RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-inst python3-dev \ python3-pip \ python3-setuptools \ - libnccl2=2.4.8-1+cuda10.1 \ - libnccl-dev=2.4.8-1+cuda10.1 + libnccl2=2.4.7-1+cuda10.0 \ + libnccl-dev=2.4.7-1+cuda10.0 # install framework ARG FRAMEWORK=tensorflow From 070d6501083a27ea88039e12343ff84892d3992b Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sun, 1 Mar 2020 11:36:53 +0800 Subject: [PATCH 09/11] add torchvision --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6de7bc1d3..16e6f8f22 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -39,7 +39,7 @@ ARG FRAMEWORK=tensorflow RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ pip3 install -U tensorflow-gpu==1.14.0; \ elif [ "$FRAMEWORK" = "pytorch" ]; then \ - pip3 install -U numpy torch==1.4.0; \ + pip3 install -U numpy==1.18.1 torchvision==0.5.0 torch==1.4.0; \ elif [ "$FRAMEWORK" = "mxnet" ]; then \ pip3 install -U mxnet-cu100==1.5.0; \ else \ From 0b1f818ec227016234ef5cab8be337c9b0a9da8f Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sun, 1 Mar 2020 14:02:55 +0800 Subject: [PATCH 10/11] fix setup and use tf 1.15 for dockerfile --- docker/Dockerfile | 4 ++- setup.py | 79 +++++++++++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 16e6f8f22..5b63e5727 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -35,9 +35,11 @@ RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-inst libnccl-dev=2.4.7-1+cuda10.0 # install framework +# note: for tf <= 1.14, you need gcc-4.9 ARG FRAMEWORK=tensorflow RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ - pip3 install -U tensorflow-gpu==1.14.0; \ + pip3 install --upgrade pip; \ + pip3 install -U tensorflow-gpu==1.15.0; \ elif [ "$FRAMEWORK" = "pytorch" ]; then \ pip3 install -U numpy==1.18.1 torchvision==0.5.0 torch==1.4.0; \ elif [ "$FRAMEWORK" = "mxnet" ]; then \ diff --git a/setup.py b/setup.py index fac6ee380..48874a902 100644 --- a/setup.py +++ b/setup.py @@ -622,6 +622,9 @@ def get_nccl_vals(): def build_mx_extension(build_ext, options): + # try to raise exception in the begining + import mxnet + # clear ROLE -- installation does not need this os.environ.pop("DMLC_ROLE", None) @@ -793,49 +796,51 @@ def build_torch_extension(build_ext, options, torch_version): # run the customize_compiler class custom_build_ext(build_ext): def build_extensions(self): + make_option = "" + # To resolve tf-gcc incompatibility + has_cxx_flag = False + glibcxx_flag = False + if not int(os.environ.get('BYTEPS_WITHOUT_TENSORFLOW', 0)): + try: + import tensorflow as tf + make_option += 'ADD_CFLAGS="' + for flag in tf.sysconfig.get_compile_flags(): + if 'D_GLIBCXX_USE_CXX11_ABI' in flag: + has_cxx_flag = True + glibcxx_flag = False if (flag[-1]=='0') else True + make_option += flag + ' ' + break + make_option += '" ' + except: + pass + + # To resolve torch-gcc incompatibility + if not int(os.environ.get('BYTEPS_WITHOUT_PYTORCH', 0)): + try: + import torch + torch_flag = torch.compiled_with_cxx11_abi() + if has_cxx_flag: + if glibcxx_flag != torch_flag: + raise DistutilsError( + '-D_GLIBCXX_USE_CXX11_ABI is not consistent between TensorFlow and PyTorch, ' + 'consider install them separately.') + else: + pass + else: + make_option += 'ADD_CFLAGS=-D_GLIBCXX_USE_CXX11_ABI=' + \ + str(int(torch_flag)) + ' ' + has_cxx_flag = True + glibcxx_flag = torch_flag + except: + pass + if not os.path.exists("3rdparty/ps-lite/build/libps.a") or \ not os.path.exists("3rdparty/ps-lite/deps/lib"): - make_option = "" if os.environ.get('CI', 'false') == 'false': make_option += "-j " if has_rdma_header(): make_option += "USE_RDMA=1 " - - # To resolve tf-gcc incompatibility - has_cxx_flag = False - glibcxx_flag = False - if not int(os.environ.get('BYTEPS_WITHOUT_TENSORFLOW', 0)): - try: - import tensorflow as tf - make_option += 'ADD_CFLAGS="' - for flag in tf.sysconfig.get_compile_flags(): - if 'D_GLIBCXX_USE_CXX11_ABI' in flag: - has_cxx_flag = True - glibcxx_flag = False if (flag[-1]=='0') else True - make_option += flag + ' ' - make_option += '"' - except: - pass - - # To resolve torch-gcc incompatibility - if not int(os.environ.get('BYTEPS_WITHOUT_PYTORCH', 0)): - try: - import torch - torch_flag = torch.compiled_with_cxx11_abi() - if has_cxx_flag: - if glibcxx_flag != torch_flag: - raise DistutilsError( - '-D_GLIBCXX_USE_CXX11_ABI is not consistent between TensorFlow and PyTorch, ' - 'consider install them separately.') - else: - pass - else: - make_option += 'ADD_CFLAGS=-D_GLIBCXX_USE_CXX11_ABI=' + \ - str(int(torch_flag)) + ' ' - has_cxx_flag = True - glibcxx_flag = torch_flag - except: - pass + make_process = subprocess.Popen('make ' + make_option, cwd='3rdparty/ps-lite', From 51497352aab798e618d4f8fa8530cbee0277d3b0 Mon Sep 17 00:00:00 2001 From: jiangyimin Date: Sun, 1 Mar 2020 14:50:46 +0800 Subject: [PATCH 11/11] improve handling of libcuda.so.1 --- setup.py | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index 48874a902..c2f84becc 100644 --- a/setup.py +++ b/setup.py @@ -622,18 +622,9 @@ def get_nccl_vals(): def build_mx_extension(build_ext, options): - # try to raise exception in the begining - import mxnet - # clear ROLE -- installation does not need this os.environ.pop("DMLC_ROLE", None) - # fix "libcuda.so.1 not found" issue - cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/local/cuda') - cuda_stub_path = cuda_home + '/lib64/stubs' - ln_command = "cd " + cuda_stub_path + "; ln -sf libcuda.so libcuda.so.1" - os.system(ln_command) - check_mx_version() mx_compile_flags, mx_link_flags = get_mx_flags( build_ext, options['COMPILE_FLAGS']) @@ -679,8 +670,6 @@ def build_mx_extension(build_ext, options): build_ext.build_extension(mxnet_lib) - os.system("rm -rf " + cuda_stub_path + "/libcuda.so.1") - def dummy_import_torch(): try: @@ -880,31 +869,38 @@ def build_extensions(self): built_plugins.append(False) else: raise - if not int(os.environ.get('BYTEPS_WITHOUT_MXNET', 0)): + if not int(os.environ.get('BYTEPS_WITHOUT_PYTORCH', 0)): try: - build_mx_extension(self, options) + torch_version = check_torch_version() + build_torch_extension(self, options, torch_version) built_plugins.append(True) - print('INFO: MXNet extension is built successfully.') + print('INFO: PyTorch extension is built successfully.') except: - if not int(os.environ.get('BYTEPS_WITH_MXNET', 0)): - print('INFO: Unable to build MXNet plugin, will skip it.\n\n' + if not int(os.environ.get('BYTEPS_WITH_PYTORCH', 0)): + print('INFO: Unable to build PyTorch plugin, will skip it.\n\n' '%s' % traceback.format_exc()) built_plugins.append(False) else: raise - if not int(os.environ.get('BYTEPS_WITHOUT_PYTORCH', 0)): + if not int(os.environ.get('BYTEPS_WITHOUT_MXNET', 0)): + # fix "libcuda.so.1 not found" issue + cuda_home = os.environ.get('BYTEPS_CUDA_HOME', '/usr/local/cuda') + cuda_stub_path = cuda_home + '/lib64/stubs' + ln_command = "cd " + cuda_stub_path + "; ln -sf libcuda.so libcuda.so.1" + os.system(ln_command) try: - torch_version = check_torch_version() - build_torch_extension(self, options, torch_version) + build_mx_extension(self, options) built_plugins.append(True) - print('INFO: PyTorch extension is built successfully.') + print('INFO: MXNet extension is built successfully.') except: - if not int(os.environ.get('BYTEPS_WITH_PYTORCH', 0)): - print('INFO: Unable to build PyTorch plugin, will skip it.\n\n' + if not int(os.environ.get('BYTEPS_WITH_MXNET', 0)): + print('INFO: Unable to build MXNet plugin, will skip it.\n\n' '%s' % traceback.format_exc()) built_plugins.append(False) else: raise + finally: + os.system("rm -rf " + cuda_stub_path + "/libcuda.so.1") if not built_plugins: print('INFO: Only server module is built.')