From ceb0380c178598a7d488f0ecf7b2885422b73256 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Wed, 26 Jun 2019 21:35:14 -0700 Subject: [PATCH 01/11] ci: initial commit --- .travis.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..41256bc7e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,24 @@ +dist: bionic +python: + - "2.7" + - "3.4" + - "3.7" +env: + - CUDA=10.1.105-1 + - CUDA_APT=10-1 + - CUDA_SHORT=10.1 + - UBUNTU_VERSION=ubuntu1804 +before_install: + - CUDA_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb + - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER} + - sudo dpkg -i ${INSTALLER} + - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub + - sudo apt-key add 7fa2af80.pub + - sudo apt update -qq + - sudo apt install -y cuda-core-${CUDA_APT} cuda-cudart-dev-${CUDA_APT} cuda-cufft-dev-${CUDA_APT} libnccl2 libnccl-dev + - sudo apt clean + - CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} + - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} + - PATH=${CUDA_HOME}/bin:${PATH} +install: + - python setup.py install From b849ca4917ec9e9b925b28356e9382ba976f62e9 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Wed, 26 Jun 2019 22:01:52 -0700 Subject: [PATCH 02/11] ci: update env --- .travis.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 41256bc7e..679e49a5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,15 @@ dist: bionic +language: python python: - "2.7" - "3.4" - "3.7" env: - - CUDA=10.1.105-1 - - CUDA_APT=10-1 - - CUDA_SHORT=10.1 - - UBUNTU_VERSION=ubuntu1804 + - CUDA=10.1.105-1 CUDA_APT=10-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 before_install: - CUDA_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb - - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER} - - sudo dpkg -i ${INSTALLER} + - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${CUDA_PKG} + - sudo dpkg -i ${CUDA_PKG} - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub - sudo apt-key add 7fa2af80.pub - sudo apt update -qq @@ -21,4 +19,4 @@ before_install: - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} - PATH=${CUDA_HOME}/bin:${PATH} install: - - python setup.py install + - python setup.py install From 54a607421f72db9c35225077bee58b2d3a7866f1 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 10:31:07 -0700 Subject: [PATCH 03/11] ci: nccl --- .travis.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 679e49a5c..d7847e6fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,13 +5,16 @@ python: - "3.4" - "3.7" env: - - CUDA=10.1.105-1 CUDA_APT=10-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 + - CUDA=10.1.105-1 CUDA_APT=10-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 NCCL=1.0.0-1 before_install: - - CUDA_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb - - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${CUDA_PKG} - - sudo dpkg -i ${CUDA_PKG} + - CUDA_REPO=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb + - NCCL_REPO=nvidia-machine-learning-repo-${UBUNTU_VERSION}_${NCCL}_amd64.deb + - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${CUDA_REPO} + - sudo dpkg -i ${CUDA_REPO} - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub - sudo apt-key add 7fa2af80.pub + - wget http://developer.download.nvidia.com/compute/machine-learning/repos/${UBUNTU_VERSION}/x86_64/${NCCL_REPO} + - sudo dpkg -i ${NCCL_REPO} - sudo apt update -qq - sudo apt install -y cuda-core-${CUDA_APT} cuda-cudart-dev-${CUDA_APT} cuda-cufft-dev-${CUDA_APT} libnccl2 libnccl-dev - sudo apt clean From b1a107718623b4f7f2a93462ae7d56800c22e272 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 10:34:46 -0700 Subject: [PATCH 04/11] ci: remove python 3.4 --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d7847e6fe..972fd0694 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ dist: bionic language: python python: - "2.7" - - "3.4" - "3.7" env: - CUDA=10.1.105-1 CUDA_APT=10-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 NCCL=1.0.0-1 From 106925d05746eae451e8933ec7c030fc2e772ce2 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 11:14:51 -0700 Subject: [PATCH 05/11] ci: disable parallel build in ci --- setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4889fa3f7..9df9176da 100644 --- a/setup.py +++ b/setup.py @@ -755,10 +755,13 @@ class custom_build_ext(build_ext): def build_extensions(self): if not os.path.exists("3rdparty/ps-lite/build/libps.a") or \ not os.path.exists("3rdparty/ps-lite/deps/lib"): - str_rdma_option = "" + make_option = "" + if os.environ.get('CI', 'false') == 'false': + make_option += "-j " if int(os.environ.get('BYTEPS_USE_RDMA', 0)): - str_rdma_option += "USE_RDMA=1" - make_process = subprocess.Popen('make -j ' + str_rdma_option, + make_option += "USE_RDMA=1 " + + make_process = subprocess.Popen('make ' + make_option, cwd='3rdparty/ps-lite', stdout=sys.stdout, stderr=sys.stderr, From a8ae9f08465012f231f29fe218e5ea39c9a5086f Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 11:59:36 -0700 Subject: [PATCH 06/11] ci: install frameworks --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 972fd0694..a69fc1ebb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,5 +20,6 @@ before_install: - CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} - PATH=${CUDA_HOME}/bin:${PATH} + - pip install mxnet-cu101 tensorflow-gpu torch torchvision install: - python setup.py install From cf23cc0f0ab3e8e65a707a1900005c06ea8931f9 Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 12:20:57 -0700 Subject: [PATCH 07/11] ci: set BYTEPS_CUDA_HOME --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index a69fc1ebb..ee1d2b1a5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,6 @@ dist: bionic language: python +cache: pip python: - "2.7" - "3.7" @@ -22,4 +23,5 @@ before_install: - PATH=${CUDA_HOME}/bin:${PATH} - pip install mxnet-cu101 tensorflow-gpu torch torchvision install: + - BYTEPS_CUDA_HOME=${CUDA_HOME} - python setup.py install From 60d31326705095b4307666e5211293f38665130c Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 13:59:06 -0700 Subject: [PATCH 08/11] ci: fix dependency --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index ee1d2b1a5..b4e410467 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,12 +11,11 @@ before_install: - NCCL_REPO=nvidia-machine-learning-repo-${UBUNTU_VERSION}_${NCCL}_amd64.deb - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${CUDA_REPO} - sudo dpkg -i ${CUDA_REPO} - - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub - - sudo apt-key add 7fa2af80.pub + - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub - wget http://developer.download.nvidia.com/compute/machine-learning/repos/${UBUNTU_VERSION}/x86_64/${NCCL_REPO} - sudo dpkg -i ${NCCL_REPO} - sudo apt update -qq - - sudo apt install -y cuda-core-${CUDA_APT} cuda-cudart-dev-${CUDA_APT} cuda-cufft-dev-${CUDA_APT} libnccl2 libnccl-dev + - sudo apt install -y cuda libnccl2 libnccl-dev - sudo apt clean - CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} From c6c8e2eb9aab31c00d4fcc741ac6f6ef24eeca2b Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 14:20:37 -0700 Subject: [PATCH 09/11] ci: numa --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b4e410467..d8550aa7b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ before_install: - wget http://developer.download.nvidia.com/compute/machine-learning/repos/${UBUNTU_VERSION}/x86_64/${NCCL_REPO} - sudo dpkg -i ${NCCL_REPO} - sudo apt update -qq - - sudo apt install -y cuda libnccl2 libnccl-dev + - sudo apt install -y cuda libnccl2 libnccl-dev libnuma-dev - sudo apt clean - CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} From e12512651b9ec3ec897eff480ad6a6879eb2977a Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 15:09:42 -0700 Subject: [PATCH 10/11] ci: lifecycle --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d8550aa7b..1a9662e73 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ before_install: - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub - wget http://developer.download.nvidia.com/compute/machine-learning/repos/${UBUNTU_VERSION}/x86_64/${NCCL_REPO} - sudo dpkg -i ${NCCL_REPO} +install: - sudo apt update -qq - sudo apt install -y cuda libnccl2 libnccl-dev libnuma-dev - sudo apt clean @@ -21,6 +22,6 @@ before_install: - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} - PATH=${CUDA_HOME}/bin:${PATH} - pip install mxnet-cu101 tensorflow-gpu torch torchvision -install: +script: - BYTEPS_CUDA_HOME=${CUDA_HOME} - python setup.py install From d9309991101dc2c13aaa60c1e3f0cdec0b440fbb Mon Sep 17 00:00:00 2001 From: Chang Lan Date: Thu, 27 Jun 2019 15:32:35 -0700 Subject: [PATCH 11/11] ci: readme badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ffee3b22d..897bcd6dd 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # BytePS +[![Build Status](https://travis-ci.org/bytedance/byteps.svg?branch=master)](https://travis-ci.org/bytedance/byteps) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) BytePS is a high performance and general distributed training framework. It supports TensorFlow, Keras, PyTorch, and MXNet, and can run on either TCP or RDMA network.