name: Run HVD-specific unit tests on GPUs
on:
  push:
    paths:
      - "ignite/**"
      - "tests/ignite/**"
      - "tests/run_gpu_tests.sh"
      - "tests/run_code_style.sh"
      - "examples/**.py"
      - "requirements-dev.txt"
      - ".github/workflows/gpu-hvd-tests.yml"
  workflow_dispatch:

concurrency:
  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
  cancel-in-progress: true

# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml

jobs:
  gpu-hvd-tests:
    strategy:
      matrix:
        # Single channel for now; the install step below already handles a
        # future "pytorch-nightly" entry via its else branch.
        pytorch-channel: [pytorch]
      fail-fast: false
    env:
      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.8"
      REPOSITORY: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.pull_request.number }}
    runs-on: linux.8xlarge.nvidia.gpu
    timeout-minutes: 60

    steps:
      - name: Clean workspace
        run: |
          echo "::group::Cleanup debug output"
          sudo rm -rfv "${GITHUB_WORKSPACE}"
          mkdir -p "${GITHUB_WORKSPACE}"
          echo "::endgroup::"

      - name: Checkout repository (pytorch/test-infra)
        uses: actions/checkout@v3
        with:
          # Support the use case where we need to checkout someone's fork
          repository: pytorch/test-infra
          path: test-infra

      - name: Setup Linux
        uses: ./test-infra/.github/actions/setup-linux

      - name: Pull docker image
        uses: ./test-infra/.github/actions/pull-docker-image
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}

      - name: Checkout repository (${{ github.repository }})
        uses: actions/checkout@v3
        with:
          # Support the use case where we need to checkout someone's fork
          repository: ${{ github.repository }}
          ref: ${{ github.ref }}
          path: ${{ github.repository }}
          fetch-depth: 1

      - name: Start Pytorch container
        working-directory: ${{ github.repository }}
        run: |
          # Detached, long-lived container; later steps run inside it via
          # `docker exec -t pthd`.
          docker run --name pthd --gpus=all --rm \
            --cap-add=SYS_PTRACE \
            --detach \
            --ipc=host \
            --security-opt seccomp=unconfined \
            --shm-size=2g \
            --tty \
            --ulimit stack=10485760:83886080 \
            -v $PWD:/work \
            -w /work \
            ${DOCKER_IMAGE}

          script=$(cat << EOF

          set -xe

          nvidia-smi
          ls -alh

          conda --version
          python --version

          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"

      - name: Install PyTorch and dependencies
        continue-on-error: false
        run: |

          script=$(cat << EOF

          set -xe

          # Install PyTorch
          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
          else
            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
          fi

          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
          pip list

          # Install dependencies
          pip install -r requirements-dev.txt
          pip install -e .

          EOF
          )

          docker exec -t pthd /bin/bash -c "${script}"

      - name: Install Horovod with NCCL GPU ops
        run: |
          script=$(cat << EOF

          set -xe

          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
          horovodrun --check-build
          pip list

          EOF
          )

          docker exec -t pthd /bin/bash -c "${script}"

      - name: Run GPU and CPU Unit HVD Tests
        run: |

          script=$(cat << EOF

          set -xe

          bash tests/run_gpu_tests.sh 2 hvd
          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd

          EOF
          )

          docker exec -t pthd /bin/bash -c "${script}"

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ${{ github.repository }}/coverage.xml
          flags: gpu-2
          fail_ci_if_error: false

      - name: Run examples in container
        continue-on-error: false
        run: |
          # FIX: this variable was previously named SCRIPT while the exec line
          # below used ${script}; shell variables are case-sensitive, so the
          # examples were executed with an empty command and never actually ran.
          script=$(cat << EOF

          set -xe

          # Install additional example dependencies
          pip install fire

          # Check training on CIFAR10, run with horovod backend using horovodrun
          # initial run
          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500
          # resume
          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt

          # Check training on CIFAR10 using spawn
          # initial run
          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
          # resume
          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt

          EOF
          )

          docker exec -t pthd /bin/bash -c "${script}"

      - name: Teardown Linux
        if: ${{ always() }}
        uses: ./test-infra/.github/actions/teardown-linux