forked from llvm/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 56
220 lines (190 loc) · 9.25 KB
/
libcxx-restart-preempted-jobs.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
name: Restart Preempted Libc++ Workflow

# The libc++ builders run on preemptable VMs, which can be shut down at any time.
# This workflow identifies when a workflow run was canceled due to the VM being preempted,
# and restarts the workflow run.

# We identify a canceled workflow run by checking the annotations of the check runs in the check suite,
# which should contain the message "The runner has received a shutdown signal."

# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow.

on:
  workflow_run:
    # Must match the `name:` of the triggering workflow exactly. YAML plain
    # scalars do not process backslash escapes, so the '+' characters are
    # written as-is.
    workflows: [Build and Test libc++]
    types:
      - completed

# Default token permissions; each job widens these as needed.
permissions:
  contents: read
jobs:
  # Inspects the failed/cancelled check runs of the triggering workflow run and
  # re-runs its failed jobs when every failure annotation found is a preemption
  # message. Publishes a "Restart Preempted Job" check run describing the outcome.
  restart:
    if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled')
    name: "Restart Job"
    permissions:
      statuses: read
      checks: write
      actions: write
    runs-on: ubuntu-latest
    steps:
      - name: "Restart Job"
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            // Annotation text treated as a genuine (non-preemption) failure.
            // NOTE(review): the trailing '.' is an unescaped regex wildcard, so this
            // also matches e.g. "exit code 12" -- confirm that this is intended.
            const failure_regex = /Process completed with exit code 1./
            // Annotation text treated as evidence that the runner was preempted.
            const preemption_regex = /The runner has received a shutdown signal/

            const wf_run = context.payload.workflow_run
            core.notice(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)

            // Create a completed check run on the head commit of the triggering
            // workflow run to record whether we restarted the workflow or not.
            //   conclusion: one of 'success', 'skipped' or 'neutral'
            //   message:    text used as the check run summary
            async function create_check_run(conclusion, message) {
              if (conclusion != 'success' && conclusion != 'skipped' && conclusion != 'neutral') {
                core.setFailed('Invalid conclusion: ' + conclusion)
                // Do not send a conclusion we already rejected to the API.
                return
              }
              await github.rest.checks.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                name: 'Restart Preempted Job',
                head_sha: wf_run.head_sha,
                status: 'completed',
                conclusion: conclusion,
                output: {
                  title: 'Restarted Preempted Job',
                  summary: message
                }
              })
            }

            console.log('Listing check runs for suite')
            // Paginate so that suites with more than 100 check runs are fully inspected.
            const check_runs = await github.paginate(github.rest.checks.listForSuite, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              check_suite_id: wf_run.check_suite_id,
              per_page: 100
            })

            // Collect the completed check runs that failed or were cancelled.
            const check_run_ids = [];
            for (const check_run of check_runs) {
              console.log('Checking check run: ' + check_run.id);
              if (check_run.status != 'completed') {
                console.log('Check run was not completed. Skipping.');
                continue;
              }
              if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
                console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
                continue;
              }
              check_run_ids.push(check_run.id);
            }

            let has_preempted_job = false;
            for (const check_run_id of check_run_ids) {
              console.log('Listing annotations for check run: ' + check_run_id);
              const annotations = await github.paginate(github.rest.checks.listAnnotations, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                check_run_id: check_run_id,
                per_page: 100
              })

              for (const annotation of annotations) {
                if (annotation.annotation_level != 'failure') {
                  continue;
                }

                const preemption_match = annotation.message.match(preemption_regex);
                if (preemption_match != null) {
                  console.log('Found preemption message: ' + annotation.message);
                  has_preempted_job = true;
                }

                const failure_match = annotation.message.match(failure_regex);
                if (failure_match != null) {
                  // We only want to restart the workflow if all of the failures were due to preemption.
                  // We don't want to restart the workflow if there were other failures.
                  core.notice('Choosing not to rerun workflow because we found a non-preemption failure. ' +
                    'Failure message: "' + annotation.message + '"');
                  await create_check_run('skipped', 'Choosing not to rerun workflow because we found a non-preemption failure\n'
                    + 'Failure message: ' + annotation.message)
                  return;
                }
              }
            }

            if (!has_preempted_job) {
              core.notice('No preempted jobs found. Not restarting workflow.');
              await create_check_run('neutral', 'No preempted jobs found. Not restarting workflow.')
              return;
            }

            // Only announce the restart once the re-run request has been accepted.
            await github.rest.actions.reRunWorkflowFailedJobs({
              owner: context.repo.owner,
              repo: context.repo.repo,
              run_id: wf_run.id
            })
            core.notice("Restarted workflow: " + wf_run.id);
            await create_check_run('success', 'Restarted workflow run due to preempted job')

  # Experimental variant of the job above, gated on a single user. It classifies
  # each failed/cancelled check run as preempted and/or legitimately failed, and
  # re-runs the failed jobs only when there are preemptions and no legitimate
  # failures. It does not publish a check run.
  # NOTE(review): the script reads the actor from `workflow_run.actor`; confirm
  # that `github.event.actor` is populated for `workflow_run` events, otherwise
  # this condition can never be true.
  restart-test:
    if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled') && github.event.actor.login == 'ldionne' # TESTING ONLY
    name: "Restart Job (test)"
    permissions:
      statuses: read
      checks: write
      actions: write
    runs-on: ubuntu-latest
    steps:
      - name: "Restart Job (test)"
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            // Annotation text treated as a genuine (non-preemption) failure.
            // NOTE(review): the trailing '.' is an unescaped regex wildcard -- confirm intent.
            const FAILURE_REGEX = /Process completed with exit code 1./
            // Annotation texts treated as evidence that the runner was preempted.
            const PREEMPTION_REGEX = /(The runner has received a shutdown signal)|(The operation was canceled)/

            // Surface every message as a notice annotation on this run.
            function log(msg) {
              core.notice(msg)
            }

            const wf_run = context.payload.workflow_run
            log(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)

            log('Listing check runs for suite')
            // Paginate so that suites with more than 100 check runs are fully inspected.
            const check_runs = await github.paginate(github.rest.checks.listForSuite, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              check_suite_id: wf_run.check_suite_id,
              per_page: 100
            })

            const preemptions = [];
            const legitimate_failures = [];
            for (const check_run of check_runs) {
              log(`Checking check run: ${check_run.id}`);
              if (check_run.status != 'completed') {
                log('Check run was not completed. Skipping.');
                continue;
              }

              if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
                log(`Check run had conclusion: ${check_run.conclusion}. Skipping.`);
                continue;
              }

              const annotations = await github.paginate(github.rest.checks.listAnnotations, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                check_run_id: check_run.id,
                per_page: 100
              })

              // Every failed check run is inspected for both kinds of annotation:
              // stopping at the first hit would hide legitimate failures in the
              // same or in later check runs and could trigger a bogus restart.
              const preemption_annotation = annotations.find(function(annotation) {
                return annotation.annotation_level == 'failure' &&
                       annotation.message.match(PREEMPTION_REGEX) != null;
              });
              if (preemption_annotation != null) {
                log(`Found preemption message: ${preemption_annotation.message}`);
                preemptions.push(check_run);
              }

              const failure_annotation = annotations.find(function(annotation) {
                return annotation.annotation_level == 'failure' &&
                       annotation.message.match(FAILURE_REGEX) != null;
              });
              if (failure_annotation != null) {
                log(`Found legitimate failure annotation: ${failure_annotation.message}`);
                legitimate_failures.push(check_run);
              }
            }

            // Arrays are always truthy in JavaScript, so test their length.
            if (preemptions.length > 0) {
              log('Found some preempted jobs');
              if (legitimate_failures.length > 0) {
                log('Also found some legitimate failures, so not restarting the workflow.');
              } else {
                log('Did not find any legitimate failures. Restarting workflow.');
                await github.rest.actions.reRunWorkflowFailedJobs({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  run_id: wf_run.id
                })
              }
            } else {
              log('Did not find any preempted jobs. Not restarting the workflow.');
            }