|
name: Multi Nodes GPU Tests

# Workflow steps, in execution order:
#  1. Checkout Pytorch Lightning
#  2. Set up Python
#  3. Cache pip
#  4. Install dependencies (AWS CLI, coverage, lightning-dtrun)
#  5. Configure AWS credentials
#  6. Get current SHA commit
#  7. Create job name
#  8. Install EKSClient (eksctl)
#  9. Create GPU node pool
# 10. Check current node pool and current Elastic pods
# 11. Apply Elastic
# 12. Wait 5 secs
# 13. Find ETCD TCP address
# 14. Update test configuration file
# 15. Apply multi node testing
# 16. Wait 400 secs
# 17. Listen to jobs logging
# 18. Statistics
# 19. Upload coverage to Codecov
# 20. Delete group node

# Disabled alternative trigger: run on every push.
#on: push

# Run on pushes to master or a release branch, and whenever a pull request
# is closed.
on:
  push:
    branches:
      - master
      - 'release/*'
  pull_request:
    types:
      - closed

# Workflow-level settings, exported as environment variables to every step.
env:
  # EKS cluster and the shape of the temporary node group created for the run.
  AWS_CLUSTER: pl-lightning-torchelastic
  NODE_TYPE: g4dn.xlarge
  NODES: '2'
  NUM_GPUS: '1'
  REGION: us-east-2
  # Polling settings for the log-watching loop in "Listen to Jobs Logging",
  # which is currently commented out.
  # NOTE(review): "CHECK_SPEEP" looks like a misspelling of CHECK_SLEEP; rename
  # it together with that loop.
  MAX_CHECKS: '300'
  CHECK_SPEEP: '2'

jobs:
  # Creates a temporary GPU node group on the EKS cluster, applies the
  # generated multi-node test manifest as an ElasticJob, collects the coverage
  # data from the worker log and finally removes the node group again.
  multi-nodes-gpu-testing:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        # Quoted so the versions stay strings (an unquoted 3.10 would load as 3.1).
        python-version: ['3.7']
        pytorch-version: ['1.5']
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    # Intended to run only after a merge. The guard below is disabled, so the
    # job also runs for pull requests that were closed without being merged.
    # if: github.event.pull_request.merged == true
    steps:

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          # NOTE(review): confirm github.event.base_ref is populated for both
          # triggers; an empty value makes checkout fall back to its default ref.
          ref: ${{ github.event.base_ref }}

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      # Note: This uses an internal pip API and may not always work
      # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-multi-node
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          pip install awscli coverage
          # todo
          # NOTE(review): the repository/ref after "PyTorchLightning/" was replaced
          # by an e-mail-protection placeholder when this file was captured; restore
          # the real pin (the commented-out line below shows the repository name).
          pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/[email protected] -q --no-cache-dir
          #pip install git+https://${{ secrets.PL_GHOST_TOKEN }}@github.com/PyTorchLightning/lightning-dtrun.git@mnodes -q --no-cache-dir

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }}
          # Same region the kubeconfig step uses, taken from the workflow env.
          aws-region: ${{ env.REGION }}

      - name: Get Current Sha Commit
        id: vars
        shell: bash
        run: |
          # Step outputs go through $GITHUB_OUTPUT; `::set-output` is deprecated.
          echo "SHA=$(git rev-parse --short HEAD)" >> "$GITHUB_OUTPUT"
          echo $PWD

      - name: Create Job Name
        id: job
        shell: bash
        run: |
          # Dots are replaced by dashes in both names (e.g. 3.7 -> 3-7).
          echo "ID=$(echo '${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}' | tr . - )" >> "$GITHUB_OUTPUT"
          echo "ID_NAME=$(echo 's-${{ steps.vars.outputs.SHA }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-e' | tr . - )" >> "$GITHUB_OUTPUT"

      - name: Install EKSClient
        run: |
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
        shell: bash

      - name: Create Gpu Node Pool
        run: |
          aws eks --region $REGION update-kubeconfig --name $AWS_CLUSTER
          eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --node-type=$NODE_TYPE --nodes=$NODES
          # eksctl create nodegroup --name=${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER --managed --spot --node-type=$NODE_TYPE --nodes=$NODES
        shell: bash

      - name: 'Check Current Node Pool | Current Elastic Pods'
        run: |
          eksctl get nodegroups --cluster $AWS_CLUSTER
          kubectl get pods -n elastic-job

      - name: Apply Elastic
        run: |
          git clone https://github.com/pytorch/elastic.git
          cd elastic/kubernetes

          kubectl apply -k config/default

          kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/master/nvidia-device-plugin.yml
          kubectl apply -f https://raw.githubusercontent.com/pytorch/elastic/master/kubernetes/config/samples/etcd.yaml

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 5

      - name: Find ETCD TCP Address
        id: tcp
        shell: bash
        run: |
          # First "ip:port" found in the etcd pod log; ports may have up to 5 digits.
          echo "TCP_ADDRESS=$(kubectl logs etcd -n elastic-job | grep -Eo '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,5}' | head -1)" >> "$GITHUB_OUTPUT"

      - name: Update Test Config. File
        run: |
          import os
          from dtrun.configs import prepare_multi_nodes_gpu_config

          assert os.path.isfile('./tests/mnode_tests.txt')
          prepare_multi_nodes_gpu_config(
              './.github/multi-nodes-gpu.yaml',
              './tests/mnode_tests.txt',
              sha="${{ steps.vars.outputs.SHA }}",
              tcp_address="${{ steps.tcp.outputs.TCP_ADDRESS }}",
              python_version="${{ matrix.python-version }}",
              torch_version="${{ matrix.pytorch-version }}",
              # Taken from the workflow env (NUM_GPUS); falls back to the former constant.
              num_gpus=int(os.environ.get("NUM_GPUS", "1")),
          )
        shell: python

      - name: Apply Multi Node Testing
        run: |
          # cat ./.github/multi-nodes-gpu.yaml
          kubectl apply -f ./.github/multi-nodes-gpu.yaml
        shell: bash

      - name: Wait
        # todo: this shall be dynamic
        if: always()
        shell: bash
        run: |
          sleep 400

      - name: Listen to Jobs Logging
        shell: bash
        run: |
          # todo: Enable automatic checking.
          # while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job | grep -i "error\|failed"; then status_code=1 && break; elif kubectl logs ${{ steps.job.outputs.ID }}-worker-0 -n elastic-job | grep "TEST END"; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
          # echo "Done waiting. Job status code: $status_code" && \
          kubectl logs ${{ steps.job.outputs.ID_NAME }}-worker-0 -n elastic-job > /tmp/full_output.txt
          # Split the log at END_TOKEN: xx00 holds the text before it, xx01 the rest.
          if grep -q 'END_TOKEN' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/END_TOKEN/'; else mv /tmp/full_output.txt xx00; fi && \
          cat xx00

      - name: Statistics
        if: success()
        run: |
          # Drop the first line of xx01 (the END_TOKEN line) and decode the rest
          # into the coverage data file at the root of the workspace.
          tail -n +2 ./xx01 | base64 --decode > "$GITHUB_WORKSPACE/.coverage"
          cd "$GITHUB_WORKSPACE" && coverage report && coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: multi-nodes,pytest
          name: multi-nodes-coverage
          fail_ci_if_error: false

      - name: Delete Group Node
        if: always()
        run: |
          # The default shell stops at the first failing command, so each deletion
          # is attempted on its own: a failed ElasticJob removal must not leave the
          # GPU node group running. The first failure code is still returned.
          status=0
          kubectl delete ElasticJob ${{ steps.job.outputs.ID_NAME }} -n elastic-job || status=$?
          eksctl delete nodegroup ${{ steps.job.outputs.ID }} --cluster=$AWS_CLUSTER || status=$?
          exit $status
0 commit comments