-
Notifications
You must be signed in to change notification settings - Fork 220
144 lines (136 loc) · 5.66 KB
/
main.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
name: Run all tests
# enable to manually trigger the tests
#on: workflow_dispatch
on:
push:
branches:
- "*"
paths:
- "megatron/**"
- "tests/**"
- ".github/**"
- "tools/**"
- "*.py"
jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
# don't use the following subnets as p3.8xlarge is not supported there:
# - subnet-06576a4b # us-east-1d
# - subnet-859322b4 # us-east-1e
# - subnet-47cfad21 # us-east-1b
- name: Try to start EC2 runner (a)
id: try-us-east-1a
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-03540b272db1624b7
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-b7533b96 # us-east-1c
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (b)
id: try-us-east-1b
if: steps.try-us-east-1a.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
continue-on-error: true
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-03540b272db1624b7
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-a396b2ad # us-east-1f
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: Try to start EC2 runner (c)
id: try-us-east-1c
if: steps.try-us-east-1b.outcome == 'failure'
uses: machulav/ec2-github-runner@v2
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-03540b272db1624b7
ec2-instance-type: p3.8xlarge
security-group-id: sg-f2a4e2fc
subnet-id: subnet-df0f6180 # us-east-1a
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
- name: See if any of 3 sub-regions had the resource
id: start-ec2-runner
run: |
if [ "${{ steps.try-us-east-1a.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1a.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1a.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1b.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1b.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1b.outputs.ec2-instance-id }}"
fi
if [ "${{ steps.try-us-east-1c.outcome }}" = "success" ]; then
echo "::set-output name=label::${{ steps.try-us-east-1c.outputs.label }}"
echo "::set-output name=ec2-instance-id::${{ steps.try-us-east-1c.outputs.ec2-instance-id }}"
fi
do-the-job:
name: Do the job on the runner
needs: start-runner # required to start the main job when the runner is ready
# need to figure out how to cancel the previous build if a new push was made the old test is still running
# concurrency: # cancel previous build on a new push
# group: ${{ github.ref }} # https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#github-context
# cancel-in-progress: true
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
steps:
- name: NVIDIA-SMI
run: nvidia-smi
- name: Checkout
uses: actions/checkout@v2
- name: Install Dependencies
run: |
pip install --upgrade pip
pip install -r requirements.txt
pip install pytest-timeout
- name: Run tests
run: pytest --timeout=300 tests -sv
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner # required to get output from the start-runner job
- do-the-job # required to wait when the main job is done
runs-on: ubuntu-latest
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@v2
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}