Merged
Changes from all commits
Commits (20)
7fc3065
Add torch-latest and torch-nightly CI workflows (#1990)
mrwyattii Jun 6, 2022
3678ee1
[bug] Add user-defined launcher args for MPI launcher (#1933)
mrwyattii Jun 6, 2022
d0eae5a
Propagate max errorcode to deepspeed when using PDSH launcher (#1994)
jerrymannil Jun 7, 2022
828ab71
[docs] add new build badges to landing page (#1998)
jeffra Jun 7, 2022
36ad311
DeepSpeed comm backend v1 (#1985)
awan-10 Jun 10, 2022
25b2fc2
Relax assertion to allow Megatron-DeepSpeed MoE to use ZeRO-1 (#2007)
Quentin-Anthony Jun 13, 2022
117c9cd
update CODEOWNERS (#2017)
conglongli Jun 14, 2022
e6f444a
[CI] force upgrade HF dependencies & output py env (#2015)
jeffra Jun 15, 2022
b666d5c
[inference] test suite for ds-kernels (bert, roberta, gpt2, gpt-neo, …
jeffra Jun 15, 2022
7c3344e
DeepSpeed examples refresh (#2021)
jeffra Jun 16, 2022
5dce73f
Fix transformer API for training-evaluation pipeline (#2018)
RezaYazdaniAminabadi Jun 16, 2022
ae198e2
DataLoader Length Fix (#1718)
Sanger2000 Jun 16, 2022
c87f6ee
DeepSpeed Monitor Module (Master) (#2013)
Quentin-Anthony Jun 16, 2022
d86a2de
Use partition size (#2011)
tjruwase Jun 20, 2022
735406e
fix import errors (#2026)
KMFODA Jun 20, 2022
ec1ec20
Fix inference unit test import error catching (#2024)
mrwyattii Jun 21, 2022
2a1a409
Retain available params until last use (#2016)
tjruwase Jun 21, 2022
678c3fe
Split parameter offload from z3 (#2009)
tjruwase Jun 21, 2022
5218177
fixed print statement (#2038)
mrwyattii Jun 22, 2022
ff87c4e
Add compression papers (#2042)
conglongli Jun 22, 2022
6 changes: 5 additions & 1 deletion .github/workflows/amd.yml
@@ -38,6 +38,10 @@ jobs:
sudo apt-get update
sudo apt-get install -y libaio-dev

- name: Python environment
run: |
pip list

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
@@ -59,5 +63,5 @@ jobs:
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -m 'sequential' unit/
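Aside, not part of the diff: the amd.yml change above drops the `-m 'not sequential'` filter from the parallel `-n 4` (pytest-xdist) run, while a separate single-process pass still selects tests marked `sequential`, and adds a "Python environment" step that dumps `pip list` for debugging. A minimal sketch of how such a marker could be registered in a `conftest.py`, assuming standard pytest conventions; the real DeepSpeed test suite may wire this differently:

```python
# Illustrative sketch only -- not the actual DeepSpeed conftest.py.
import pytest


def pytest_configure(config):
    # Register the marker so `-m 'sequential'` / `-m 'not sequential'`
    # expressions select tests without "unknown marker" warnings.
    config.addinivalue_line(
        "markers",
        "sequential: tests that must run alone rather than under pytest-xdist -n",
    )


@pytest.mark.sequential
def test_needs_exclusive_gpu():
    # Placeholder: a real test here would use a resource that cannot be shared
    # across xdist workers (e.g. a fixed TCP port or the whole GPU).
    assert True
```

Registering the marker once in `conftest.py` keeps the `-m` expressions used by all of these workflows consistent across the suite.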
63 changes: 63 additions & 0 deletions .github/workflows/nv-inference.yml
@@ -0,0 +1,63 @@
name: nv-inference

on:
push:
branches:
- 'master'
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn,inf]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'inference' unit/ --torch_ver="1.8" --cuda_ver="11.1"
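Aside, not part of the diff: `--torch_ver` and `--cuda_ver` are custom pytest command-line options supplied by the test suite, and `-m 'inference'` selects tests carrying an `inference` marker. A hypothetical `conftest.py` sketch of how such options could be declared and checked against the installed torch build; the option handling shown here is an assumption, not the actual DeepSpeed implementation:

```python
# Hypothetical sketch -- not the actual DeepSpeed conftest.py.
import torch
from packaging import version


def pytest_addoption(parser):
    parser.addoption("--torch_ver", default=None, type=str,
                     help="torch version expected on this CI runner, e.g. 1.8")
    parser.addoption("--cuda_ver", default=None, type=str,
                     help="CUDA version expected on this CI runner, e.g. 11.1")


def _assert_version(name, expected, installed):
    if expected is None:
        return
    exp = version.parse(expected).release
    inst = version.parse(installed).release
    # Compare only as many components as the expected value specifies, so
    # "1.8" matches any 1.8.x build and "11.1" matches 11.1.*.
    assert inst[:len(exp)] == exp, \
        f"{name} version mismatch: expected {expected}, found {installed}"


def pytest_configure(config):
    _assert_version("torch", config.getoption("--torch_ver"), torch.__version__)
    _assert_version("cuda", config.getoption("--cuda_ver"), torch.version.cuda or "0.0")
```

A check like this fails fast when a runner's label (cu111, cu113, ...) no longer matches what is actually installed, instead of letting version-specific tests fail in confusing ways later in the run.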
12 changes: 11 additions & 1 deletion .github/workflows/nv-lightning-v100.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, torch18, v100]
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2
@@ -29,16 +29,26 @@ jobs:
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,autotuning]
ds_report

- name: PyTorch Lightning Tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
pip uninstall --yes pytorch-lightning
pip install pytorch-lightning
pip install "protobuf<4.21.0"
cd tests
52 changes: 52 additions & 0 deletions .github/workflows/nv-nightly.yml
@@ -0,0 +1,52 @@
name: nv-nightly

on:
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn,inf]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.8" --cuda_ver="11.1"
64 changes: 64 additions & 0 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -0,0 +1,64 @@
name: nv-torch-latest-v100

on:
push:
branches:
- 'master'
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu113, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
57 changes: 57 additions & 0 deletions .github/workflows/nv-torch-nightly-v100.yml
@@ -0,0 +1,57 @@
name: nv-torch-nightly-v100

on:
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu113, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu113
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
15 changes: 12 additions & 3 deletions .github/workflows/nv-torch12-p40.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, torch12, p40]
runs-on: [self-hosted, nvidia, cu101, p40]

steps:
- uses: actions/checkout@v2
@@ -29,25 +29,34 @@ jobs:
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.2.0 torchvision==0.4.0
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Python environment
run: |
pip list

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Install deepspeed
run: |
pip install .[dev,autotuning]
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.2" --cuda_ver="10"
14 changes: 11 additions & 3 deletions .github/workflows/nv-torch18-v100.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, torch18, v100]
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2
@@ -29,6 +29,8 @@ jobs:
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -40,10 +42,16 @@ jobs:
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

@@ -52,5 +60,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="11.1"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11.1"