Merged
Changes from all commits
Commits (20)
7fc3065
Add torch-latest and torch-nightly CI workflows (#1990)
mrwyattii Jun 6, 2022
3678ee1
[bug] Add user-defined launcher args for MPI launcher (#1933)
mrwyattii Jun 6, 2022
d0eae5a
Propagate max errorcode to deepspeed when using PDSH launcher (#1994)
jerrymannil Jun 7, 2022
828ab71
[docs] add new build badges to landing page (#1998)
jeffra Jun 7, 2022
36ad311
DeepSpeed comm backend v1 (#1985)
awan-10 Jun 10, 2022
25b2fc2
Relax assertion to allow Megatron-DeepSpeed MoE to use ZeRO-1 (#2007)
Quentin-Anthony Jun 13, 2022
117c9cd
update CODEOWNERS (#2017)
conglongli Jun 14, 2022
e6f444a
[CI] force upgrade HF dependencies & output py env (#2015)
jeffra Jun 15, 2022
b666d5c
[inference] test suite for ds-kernels (bert, roberta, gpt2, gpt-neo, …
jeffra Jun 15, 2022
7c3344e
DeepSpeed examples refresh (#2021)
jeffra Jun 16, 2022
5dce73f
Fix transformer API for training-evaluation pipeline (#2018)
RezaYazdaniAminabadi Jun 16, 2022
ae198e2
DataLoader Length Fix (#1718)
Sanger2000 Jun 16, 2022
c87f6ee
DeepSpeed Monitor Module (Master) (#2013)
Quentin-Anthony Jun 16, 2022
d86a2de
Use partition size (#2011)
tjruwase Jun 20, 2022
735406e
fix import errors (#2026)
KMFODA Jun 20, 2022
ec1ec20
Fix inference unit test import error catching (#2024)
mrwyattii Jun 21, 2022
2a1a409
Retain available params until last use (#2016)
tjruwase Jun 21, 2022
678c3fe
Split parameter offload from z3 (#2009)
tjruwase Jun 21, 2022
5218177
fixed print statement (#2038)
mrwyattii Jun 22, 2022
ff87c4e
Add compression papers (#2042)
conglongli Jun 22, 2022
6 changes: 5 additions & 1 deletion .github/workflows/amd.yml
@@ -38,6 +38,10 @@ jobs:
sudo apt-get update
sudo apt-get install -y libaio-dev

- name: Python environment
run: |
pip list

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
@@ -59,5 +63,5 @@ jobs:
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -m 'sequential' unit/
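Aside, not part of the diff: the amd.yml change above drops the `-m 'not sequential'` filter from the parallel `-n 4` (pytest-xdist) run, while a separate single-process pass still selects tests marked `sequential`, and adds a "Python environment" step that dumps `pip list` for debugging. A minimal sketch of how such a marker could be registered in a `conftest.py`, assuming standard pytest conventions; the real DeepSpeed test suite may wire this differently:

```python
# Illustrative sketch only -- not the actual DeepSpeed conftest.py.
import pytest


def pytest_configure(config):
    # Register the marker so `-m 'sequential'` / `-m 'not sequential'`
    # expressions select tests without "unknown marker" warnings.
    config.addinivalue_line(
        "markers",
        "sequential: tests that must run alone rather than under pytest-xdist -n",
    )


@pytest.mark.sequential
def test_needs_exclusive_gpu():
    # Placeholder: a real test here would use a resource that cannot be shared
    # across xdist workers (e.g. a fixed TCP port or the whole GPU).
    assert True
```

Registering the marker once in `conftest.py` keeps the `-m` expressions used by all of these workflows consistent across the suite.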
63 changes: 63 additions & 0 deletions .github/workflows/nv-inference.yml
@@ -0,0 +1,63 @@
name: nv-inference

on:
push:
branches:
- 'master'
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn,inf]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'inference' unit/ --torch_ver="1.8" --cuda_ver="11.1"
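Aside, not part of the diff: `--torch_ver` and `--cuda_ver` are custom pytest command-line options supplied by the test suite, and `-m 'inference'` selects tests carrying an `inference` marker. A hypothetical `conftest.py` sketch of how such options could be declared and checked against the installed torch build; the option handling shown here is an assumption, not the actual DeepSpeed implementation:

```python
# Hypothetical sketch -- not the actual DeepSpeed conftest.py.
import torch
from packaging import version


def pytest_addoption(parser):
    parser.addoption("--torch_ver", default=None, type=str,
                     help="torch version expected on this CI runner, e.g. 1.8")
    parser.addoption("--cuda_ver", default=None, type=str,
                     help="CUDA version expected on this CI runner, e.g. 11.1")


def _assert_version(name, expected, installed):
    if expected is None:
        return
    exp = version.parse(expected).release
    inst = version.parse(installed).release
    # Compare only as many components as the expected value specifies, so
    # "1.8" matches any 1.8.x build and "11.1" matches 11.1.*.
    assert inst[:len(exp)] == exp, \
        f"{name} version mismatch: expected {expected}, found {installed}"


def pytest_configure(config):
    _assert_version("torch", config.getoption("--torch_ver"), torch.__version__)
    _assert_version("cuda", config.getoption("--cuda_ver"), torch.version.cuda or "0.0")
```

A check like this fails fast when a runner's label (cu111, cu113, ...) no longer matches what is actually installed, instead of letting version-specific tests fail in confusing ways later in the run.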
12 changes: 11 additions & 1 deletion .github/workflows/nv-lightning-v100.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, torch18, v100]
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2
@@ -29,16 +29,26 @@ jobs:
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,autotuning]
ds_report

- name: PyTorch Lightning Tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
pip uninstall --yes pytorch-lightning
pip install pytorch-lightning
pip install "protobuf<4.21.0"
cd tests
52 changes: 52 additions & 0 deletions .github/workflows/nv-nightly.yml
@@ -0,0 +1,52 @@
name: nv-nightly

on:
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn,inf]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/ --torch_ver="1.8" --cuda_ver="11.1"
64 changes: 64 additions & 0 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -0,0 +1,64 @@
name: nv-torch-latest-v100

on:
push:
branches:
- 'master'
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu113, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
57 changes: 57 additions & 0 deletions .github/workflows/nv-torch-nightly-v100.yml
@@ -0,0 +1,57 @@
name: nv-torch-nightly-v100

on:
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu113, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu113
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
15 changes: 12 additions & 3 deletions .github/workflows/nv-torch12-p40.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, torch12, p40]
runs-on: [self-hosted, nvidia, cu101, p40]

steps:
- uses: actions/checkout@v2
@@ -29,25 +29,34 @@ jobs:
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.2.0 torchvision==0.4.0
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Python environment
run: |
pip list

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Install deepspeed
run: |
pip install .[dev,autotuning]
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.2" --cuda_ver="10"
14 changes: 11 additions & 3 deletions .github/workflows/nv-torch18-v100.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, torch18, v100]
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2
@@ -29,6 +29,8 @@ jobs:
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -40,10 +42,16 @@ jobs:
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

@@ -52,5 +60,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/ --torch_ver="1.8" --cuda_ver="11.1"
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/ --torch_ver="1.8" --cuda_ver="11.1"