Commit f2b1183

Ported gpu hvd tests from circleci to GHA (#2619)
* Ported gpu hvd tests from circleci to GHA
* Fixing HVD installation
* WIP replaced pip with conda
* Updates
* Update gpu-hvd-tests.yml
* Update gpu-hvd-tests.yml
* More updates
* Do not build with NCCL
* Updated cache name
* WIP
* Use AGENT_TOOLSDIRECTORY for isolation
* Fixing horovod installation
* Added cmake and scikit-build step
* Ported tests to pytorch infra
* Added set -e to gpu-*tests.yml
* Fixed issue with test__hvd_dist_model_warning_index_less_localrank

1 parent 11a1fba commit f2b1183

File tree

3 files changed: +197 -9 lines changed


.github/workflows/gpu-hvd-tests.yml

+188
@@ -0,0 +1,188 @@
+name: Run HVD-specific unit tests on GPUs
+on:
+  push:
+    paths:
+      - "ignite/**"
+      - "tests/ignite/**"
+      - "tests/run_gpu_tests.sh"
+      - "tests/run_code_style.sh"
+      - "examples/**.py"
+      - "requirements-dev.txt"
+      - ".github/workflows/gpu-hvd-tests.yml"
+  workflow_dispatch:
+
+concurrency:
+  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
+  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
+  cancel-in-progress: true
+
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
+jobs:
+  gpu-hvd-tests:
+    strategy:
+      matrix:
+        pytorch-channel: [pytorch, ]
+      fail-fast: false
+    env:
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.8"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 60
+
+    steps:
+      - name: Clean workspace
+        run: |
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
+
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
+        with:
+          docker-image: ${{ env.DOCKER_IMAGE }}
+
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
+        run: |
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+          set -xe
+
+          nvidia-smi
+          ls -alh
+
+          conda --version
+          python --version
+
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
+          else
+            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
+          fi
+
+          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
+          pip list
+
+          # Install dependencies
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install Horovod with NCCL GPU ops
+        run: |
+          script=$(cat << EOF
+
+          set -xe
+
+          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
+          horovodrun --check-build
+          pip list
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Run GPU and CPU Unit HVD Tests
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          bash tests/run_gpu_tests.sh 2 hvd
+          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ${{ github.repository }}/coverage.xml
+          flags: gpu-2
+          fail_ci_if_error: false
+
+      - name: Run examples in container
+        continue-on-error: false
+        run: |
+          SCRIPT=$(cat << EOF
+
+          set -xe
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on CIFAR10, run with horovod backend using horovodrun
+          # initial run
+          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on CIFAR10 using spawn
+          # initial run
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux
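For orientation, the core of this workflow can be replayed outside CI. A minimal sketch, assuming a Linux host with at least 2 CUDA 11.8 GPUs, Docker with the NVIDIA runtime, and an ignite checkout in the current directory (image tag and commands are taken from the workflow above):

    # Start the same container the workflow uses (options condensed)
    docker run --name pthd --gpus=all --rm --detach --ipc=host --tty \
        -v "$PWD:/work" -w /work pytorch/conda-builder:cuda11.8

    docker exec -t pthd /bin/bash -c '
    set -xe
    pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
    pip install -r requirements-dev.txt && pip install -e .
    # Horovod is compiled against the installed torch, with NCCL GPU ops enabled
    HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
    horovodrun --check-build
    bash tests/run_gpu_tests.sh 2 hvd
    '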

.github/workflows/gpu-tests.yml

+8 -8
@@ -19,7 +19,7 @@ concurrency:
 # Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
 
 jobs:
-  gpu-tests:
+  gpu-tests:
     strategy:
       matrix:
         pytorch-channel: [pytorch, pytorch-nightly]
@@ -80,7 +80,7 @@ jobs:
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           nvidia-smi
           ls -alh
@@ -98,7 +98,7 @@ jobs:
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           # Install PyTorch
           if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
@@ -119,13 +119,13 @@ jobs:
 
           docker exec -t pthd /bin/bash -c "${script}"
 
-      - name: Run 1 Node 2 GPUs Unit Tests
+      - name: Run GPU Unit Tests
        continue-on-error: false
         run: |
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           bash tests/run_gpu_tests.sh 2
 
@@ -145,8 +145,8 @@ jobs:
         continue-on-error: false
         run: |
           SCRIPT=$(cat << EOF
-
-          set -x
+
+          set -xe
 
           # Install additional example dependencies
           pip install fire
@@ -156,7 +156,7 @@ jobs:
           CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
           ## resume
           CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
-
+
           # Check training on cifar10, run with NCCL backend using torchrun
           ## initial run
           CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
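Aside from the step rename, the substantive change in this file is `set -x` becoming `set -xe` in every heredoc script. With `-x` alone, a failing command does not change the script's exit status, so a broken install or test could leave the CI step green; `-e` aborts on the first failure. A quick illustration (hypothetical commands, not from the workflow):

    bash -c 'set -x;  false; echo "still runs"'    # exit status 0: the failure is masked
    bash -c 'set -xe; false; echo "never runs"'    # exit status 1: the step fails as intended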

tests/ignite/distributed/comp_models/test_horovod.py

+1 -1
@@ -184,7 +184,7 @@ def _test__hvd_dist_model_warning_index_less_localrank():
 @pytest.mark.distributed
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
 def test__hvd_dist_model_warning_index_less_localrank(gloo_hvd_executor):
-    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), num_proc=torch.cuda.device_count())
+    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), np=torch.cuda.device_count())
 
 
 def _test_dist_spawn_fn(local_rank, backend, world_size, device):
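The test fix renames the keyword passed through the `gloo_hvd_executor` fixture to Horovod's in-process launcher, whose worker-count parameter is spelled `np` rather than `num_proc` in the Horovod version this CI installs (treat that signature detail as an assumption inferred from the fix). To re-run just the fixed test locally (assumes Horovod built with Gloo support and at least 2 GPUs):

    pytest tests/ignite/distributed/comp_models/test_horovod.py \
        -m distributed -k test__hvd_dist_model_warning_index_less_localrank -vvv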
