
Commit 27e2b7f

tchaton and Borda authored
Mnodes (#5020)
* add a new workflow * update workflow * rmane * install * push * push * push * push * push * . * . * . * . * . * update * i * update * update * update * update * commit * update * update * update * update * update * update path * update * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * i * update * i * i * i * i * i * i * update * trigger * trigger * j * j * j * j * j * j * j * j * j * j Co-authored-by: Jirka Borovec <[email protected]>
1 parent 7abd822 · commit 27e2b7f

File tree

3 files changed: +339 −0 lines changed

.github/workflows/ci_test-mnodes.yml

+212
@@ -0,0 +1,212 @@
name: Multi Nodes GPU Tests

# Workflow Steps:
# 1. Checkout Pytorch Lightning
# 2. Set up Python
# 3. Configure AWS Credentials
# 4. Install AWS Client
# 5. Get Current SHA Commit
# 6. Create Job Name
# 7. Update Test Configuration File
# 8. Install EKSClient
# 9. Create Gpu Node Pool
# 10. Check Current Node Pool | Current Elastic Pods
# 11. Apply Elastic
# 12. Wait 5 secs
# 13. Find ETCD TCP Address
# 14. Update Test Configuration File
# 15. Apply Multi Node Testing
# 16. Wait 400 secs
# 17. Listen to Jobs Logging
# 18. Statistics
# 19. Upload coverage results
# 20. Upload coverage to Codecov
# 21. Delete Group Node

#on: push

on:
  push:
    branches:
      - master
      - release/*
  pull_request:
    types: [closed]

env:
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: 2
  NUM_GPUS: 1
  REGION: us-east-2
  MAX_CHECKS: 300
  CHECK_SLEEP: 2

jobs:
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.7]
        pytorch-version: [1.5]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    # runs only when a merge happened.
    # if: github.event.pull_request.merged == true
    steps:

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          ref: ${{ github.event.base_ref }}

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      # Note: This uses an internal pip API and may not always work
      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-multi-node
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          pip install awscli coverage
          # todo
          pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/[email protected] -q --no-cache-dir
          #pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@mnodes -q --no-cache-dir

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
          aws-region: us-east-2

      - name: Get Current SHA Commit
        id: vars
        shell: bash
        run: |
          echo "::set-output name=SHA::$(git rev-parse --short HEAD)"
          echo $PWD

      - name: Create Job Name
        id: job
        shell: bash
        run: |
          echo "::set-output name=ID::$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )"
          echo "::set-output name=ID_NAME::$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )"

      - name: Install EKSClient
        run: |
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
        shell: bash

      - name: Create Gpu Node Pool
        run: |
          aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
          eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
          # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
        shell: bash

      - name: Check Current Node Pool | Current Elastic Pods
        run: |
          eksctl get nodegroups --cluster $AWS_CLUSTER
          kubectl get pods -n elastic-job

      - name: Apply Elastic
        run: |
          git clone https://github.com/pytorch/elastic.git
          cd elastic/kubernetes

          kubectl apply -k config/default

          kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
          kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 5

      - name: Find ETCD TCP Address
        id: tcp
        shell: bash
        run: |
          echo "::set-output name=TCP_ADDRESS::$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,4}' | head -1)"

      - name: Update Test Config. File
        run: |
          import os
          from dtrun.configs import prepare_multi_nodes_gpu_config

          assert os.path.isfile('./tests/mnode_tests.txt')
          prepare_multi_nodes_gpu_config(
              './.github/multi-nodes-gpu.yaml',
              './tests/mnode_tests.txt',
              sha="${{ steps.vars.outputs.SHA }}",
              tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
              python_version="${{ matrix.python-version }}",
              torch_version="${{ matrix.pytorch-version }}",
              num_gpus=1,
          )
        shell: python

      - name: Apply Multi Node Testing
        run: |
          # cat ./.github/multi-nodes-gpu.yaml
          kubectl apply -f ./.github/multi-nodes-gpu.yaml
        shell: bash

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 400

      - name: Listen to Jobs Logging
        shell: bash
        run: |
          # todo: Enable automatic checking.
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SLEEP; done && \
          # echo "Done waiting. Job status code: $status_code" && \
          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
          cat xx00

      - name: Statistics
        if: success()
        run: |
          cat ./xx01 | tail -n +2 | base64 --decode > /home/runner/work/pytorch-lightning/pytorch-lightning/.coverage
          cd /home/runner/work/pytorch-lightning/pytorch-lightning && coverage report && coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: multi-nodes,pytest
          name: multi-nodes-coverage
          fail_ci_if_error: false

      - name: Delete Group Node
        if: always()
        run: |
          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job
          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER
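
Note: lightning-dtrun is a private PyTorchLightning repository, so the prepare_multi_nodes_gpu_config helper used in the "Update Test Config. File" step is not visible in this diff. The sketch below is a guess at its behavior, inferred purely from the call site above: fill placeholders in the ElasticJob manifest (./.github/multi-nodes-gpu.yaml) with the commit SHA, etcd endpoint, Python/PyTorch versions, GPU count, and the tests listed in tests/mnode_tests.txt. The placeholder names and the function body are assumptions, not the actual dtrun implementation.

# Hypothetical sketch only — NOT the real dtrun code. Behavior is guessed
# from how the workflow calls it: template substitution into the manifest
# that the next step applies with `kubectl apply -f`.
def prepare_multi_nodes_gpu_config(
    config_path: str,
    test_list_path: str,
    sha: str = "",
    tcp_address: str = "",
    python_version: str = "",
    torch_version: str = "",
    num_gpus: int = 1,
) -> None:
    with open(test_list_path) as fp:
        # one pytest node id per line, e.g. ./tests/...::test_name
        tests = " ".join(line.strip() for line in fp if line.strip())

    with open(config_path) as fp:
        template = fp.read()

    # assumed placeholder names in .github/multi-nodes-gpu.yaml
    manifest = (
        template.replace("{SHA}", sha)
        .replace("{TCP_ADDRESS}", tcp_address)
        .replace("{PYTHON_VERSION}", python_version)
        .replace("{TORCH_VERSION}", torch_version)
        .replace("{NUM_GPUS}", str(num_gpus))
        .replace("{TESTS}", tests)
    )

    # overwrite in place so the next step can apply it
    with open(config_path, "w") as fp:
        fp.write(manifest)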
tests/backends/test_multi_nodes_gpu.py

+125
@@ -0,0 +1,125 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys

import pytest
import torch

ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
sys.path.insert(0, ROOT)
DIR_PATH = os.path.dirname(os.path.realpath(__file__))

from pytorch_lightning import LightningModule, Trainer  # noqa: E402
from tests.base.boring_model import BoringModel  # noqa: E402


@pytest.mark.skipif(os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') != '1', reason="test should be run outside of pytest")
def test_logging_sync_dist_true_ddp(tmpdir):
    """
    Tests to ensure that the sync_dist flag works with multi-node DDP (should just return the original value)
    """
    fake_result = 1

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            acc = self.step(batch[0])
            self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True)
            return acc

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True)
            return {"x": loss}

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=2,
        weights_summary=None,
        accelerator="ddp",
        gpus=1,
        num_nodes=2,
    )
    trainer.fit(model)

    assert trainer.logged_metrics['foo'] == fake_result
    assert trainer.logged_metrics['bar'] == fake_result


@pytest.mark.skipif(os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') != '1', reason="test should be run outside of pytest")
def test__validation_step__log(tmpdir):
    """
    Tests that validation_step can log
    """
    os.environ['PL_DEV_DEBUG'] = '1'

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            acc = self.step(batch)
            acc = acc + batch_idx
            self.log('a', acc, on_step=True, on_epoch=True)
            self.log('a2', 2)

            self.training_step_called = True
            return acc

        def validation_step(self, batch, batch_idx):
            acc = self.step(batch)
            acc = acc + batch_idx
            self.log('b', acc, on_step=True, on_epoch=True)
            self.validation_step_called = True

        def backward(self, loss, optimizer, optimizer_idx):
            return LightningModule.backward(self, loss, optimizer, optimizer_idx)

    model = TestModel()
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        max_epochs=2,
        log_every_n_steps=1,
        weights_summary=None,
        accelerator="ddp",
        gpus=1,
        num_nodes=2,
    )
    trainer.fit(model)

    # make sure all the metrics are available for callbacks
    expected_logged_metrics = {
        'a2',
        'a_step',
        'a_epoch',
        'b_step/epoch_0',
        'b_step/epoch_1',
        'b_epoch',
        'epoch',
    }
    logged_metrics = set(trainer.logged_metrics.keys())
    assert expected_logged_metrics == logged_metrics

    # we don't want to enable val metrics during steps because it is not something that users should do
    # on purpose DO NOT allow step_b... it's silly to monitor val step metrics
    callback_metrics = set(trainer.callback_metrics.keys())
    callback_metrics.remove('debug_epoch')
    expected_cb_metrics = {'a', 'a2', 'b', 'a_epoch', 'b_epoch', 'a_step'}
    assert expected_cb_metrics == callback_metrics
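
Note: the docstring of test_logging_sync_dist_true_ddp refers to the sync_dist flag, although neither test passes it explicitly. For context, below is a minimal sketch (not part of this commit) of what enabling it looks like: the logged value is reduced across all DDP processes before being recorded, so each rank sees the same aggregated metric. The sync_dist/sync_dist_op parameter names follow the self.log API of Lightning around this release and should be treated as an assumption.

# Minimal sketch, not part of this commit: with sync_dist=True, 'bar' is
# reduced over the whole world (2 nodes x 1 GPU) before it is logged.
from tests.base.boring_model import BoringModel


class SyncedBoringModel(BoringModel):
    def validation_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        # assumption: sync_dist / sync_dist_op as in the ~v1.1 self.log API
        self.log('bar', loss, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='mean')
        return {"x": loss}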

tests/mnode_tests.txt

+2
@@ -0,0 +1,2 @@
./tests/backends/test_multi_nodes_gpu.py::test_logging_sync_dist_true_ddp
./tests/backends/test_multi_nodes_gpu.py::test__validation_step__log
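
Note: how these two pytest node ids get executed on the worker pods is handled by the private dtrun runner, so it is not shown in this diff. The sketch below illustrates one plausible worker-side driver, inferred from the "Listen to Jobs Logging" and "Statistics" steps of the workflow: run the gated tests under coverage with PL_RUNNING_SPECIAL_TESTS=1, then print END_TOKEN followed by the base64-encoded .coverage file, which is exactly the layout that csplit and `tail -n +2 | base64 --decode` expect. Everything here is an assumption except the token name and the decode pipeline, which come from the workflow above; the elastic multi-node launch itself (etcd rendezvous, one process per GPU) is assumed to happen around this script.

# Hypothetical worker-side driver — NOT the real dtrun code.
import base64
import os
import subprocess
import sys

# un-skip the special tests guarded by @pytest.mark.skipif
env = dict(os.environ, PL_RUNNING_SPECIAL_TESTS="1")

with open("./tests/mnode_tests.txt") as fp:
    tests = [line.strip() for line in fp if line.strip()]

for test in tests:
    # -a appends, so both runs accumulate into a single .coverage file
    subprocess.run(["coverage", "run", "-a", "-m", "pytest", "-v", test], env=env, check=True)

# Emit the marker and payload the "Statistics" step decodes: everything
# after the END_TOKEN line is the base64 of the .coverage data file.
print("END_TOKEN")
with open(".coverage", "rb") as fp:
    sys.stdout.write(base64.b64encode(fp.read()).decode() + "\n")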
