Commit f2b1183

Ported gpu hvd tests from circleci to GHA (#2619)
* Ported gpu hvd tests from circleci to GHA
* Fixing HVD installation
* WIP replaced pip with conda
* Updates
* Update gpu-hvd-tests.yml
* Update gpu-hvd-tests.yml
* More updates
* Do not build with NCCL
* Updated cache name
* WIP
* Use AGENT_TOOLSDIRECTORY for isolation
* Fixing horovod installation
* Added cmake and scikit-build step
* Ported tests to pytorch infra
* Added set -e to gpu-*tests.yml
* Fixed issue with test__hvd_dist_model_warning_index_less_localrank

1 parent 11a1fba commit f2b1183

File tree

3 files changed: +197 -9 lines changed


.github/workflows/gpu-hvd-tests.yml

+188
@@ -0,0 +1,188 @@
+name: Run HVD-specific unit tests on GPUs
+on:
+  push:
+    paths:
+      - "ignite/**"
+      - "tests/ignite/**"
+      - "tests/run_gpu_tests.sh"
+      - "tests/run_code_style.sh"
+      - "examples/**.py"
+      - "requirements-dev.txt"
+      - ".github/workflows/gpu-hvd-tests.yml"
+  workflow_dispatch:
+
+concurrency:
+  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
+  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
+  cancel-in-progress: true
+
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
+jobs:
+  gpu-hvd-tests:
+    strategy:
+      matrix:
+        pytorch-channel: [pytorch, ]
+      fail-fast: false
+    env:
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.8"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 60
+
+    steps:
+      - name: Clean workspace
+        run: |
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
+
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
+        with:
+          docker-image: ${{ env.DOCKER_IMAGE }}
+
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
+        run: |
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+          set -xe
+
+          nvidia-smi
+          ls -alh
+
+          conda --version
+          python --version
+
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
+          else
+            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
+          fi
+
+          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
+          pip list
+
+          # Install dependencies
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install Horovod with NCCL GPU ops
+        run: |
+          script=$(cat << EOF
+
+          set -xe
+
+          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
+          horovodrun --check-build
+          pip list
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Run GPU and CPU Unit HVD Tests
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          bash tests/run_gpu_tests.sh 2 hvd
+          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ${{ github.repository }}/coverage.xml
+          flags: gpu-2
+          fail_ci_if_error: false
+
+      - name: Run examples in container
+        continue-on-error: false
+        run: |
+          SCRIPT=$(cat << EOF
+
+          set -xe
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on CIFAR10, run with horovod backend using horovodrun
+          # initial run
+          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on CIFAR10 using spawn
+          # initial run
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux
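For orientation, the core of this workflow can be replayed outside CI. A minimal sketch, assuming a Linux host with at least 2 CUDA 11.8 GPUs, Docker with the NVIDIA runtime, and an ignite checkout in the current directory (image tag and commands are taken from the workflow above):

    # Start the same container the workflow uses (options condensed)
    docker run --name pthd --gpus=all --rm --detach --ipc=host --tty \
        -v "$PWD:/work" -w /work pytorch/conda-builder:cuda11.8

    docker exec -t pthd /bin/bash -c '
    set -xe
    pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
    pip install -r requirements-dev.txt && pip install -e .
    # Horovod is compiled against the installed torch, with NCCL GPU ops enabled
    HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
    horovodrun --check-build
    bash tests/run_gpu_tests.sh 2 hvd
    '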

.github/workflows/gpu-tests.yml

+8 -8
@@ -19,7 +19,7 @@ concurrency:
 # Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
 
 jobs:
-  gpu-tests:
+  gpu-tests:
     strategy:
       matrix:
         pytorch-channel: [pytorch, pytorch-nightly]
@@ -80,7 +80,7 @@ jobs:
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           nvidia-smi
           ls -alh
@@ -98,7 +98,7 @@ jobs:
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           # Install PyTorch
           if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
@@ -119,13 +119,13 @@ jobs:
 
           docker exec -t pthd /bin/bash -c "${script}"
 
-      - name: Run 1 Node 2 GPUs Unit Tests
+      - name: Run GPU Unit Tests
        continue-on-error: false
         run: |
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           bash tests/run_gpu_tests.sh 2
 
@@ -145,8 +145,8 @@ jobs:
         continue-on-error: false
         run: |
           SCRIPT=$(cat << EOF
-
-          set -x
+
+          set -xe
 
           # Install additional example dependencies
           pip install fire
@@ -156,7 +156,7 @@ jobs:
           CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
           ## resume
           CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
-
+
           # Check training on cifar10, run with NCCL backend using torchrun
           ## initial run
           CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
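Aside from the step rename, the substantive change in this file is `set -x` becoming `set -xe` in every heredoc script. With `-x` alone, a failing command does not change the script's exit status, so a broken install or test could leave the CI step green; `-e` aborts on the first failure. A quick illustration (hypothetical commands, not from the workflow):

    bash -c 'set -x;  false; echo "still runs"'    # exit status 0: the failure is masked
    bash -c 'set -xe; false; echo "never runs"'    # exit status 1: the step fails as intended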

tests/ignite/distributed/comp_models/test_horovod.py

+1 -1
@@ -184,7 +184,7 @@ def _test__hvd_dist_model_warning_index_less_localrank():
 @pytest.mark.distributed
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
 def test__hvd_dist_model_warning_index_less_localrank(gloo_hvd_executor):
-    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), num_proc=torch.cuda.device_count())
+    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), np=torch.cuda.device_count())
 
 
 def _test_dist_spawn_fn(local_rank, backend, world_size, device):
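The test fix renames the keyword passed through the `gloo_hvd_executor` fixture to Horovod's in-process launcher, whose worker-count parameter is spelled `np` rather than `num_proc` in the Horovod version this CI installs (treat that signature detail as an assumption inferred from the fix). To re-run just the fixed test locally (assumes Horovod built with Gloo support and at least 2 GPUs):

    pytest tests/ignite/distributed/comp_models/test_horovod.py \
        -m distributed -k test__hvd_dist_model_warning_index_less_localrank -vvv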
