Skip to content

Commit 97fe9d1

Browse files
author
Elad Ben-Israel
committed
fix(eks): version update completes prematurely
The `UpdateClusterVersion` operation takes a while to begin and until then, the cluster's status is still `ACTIVE` instead `UPDATING` as expected. This causes the `isComplete` handler, which is called immediately, to think that the operation is complete, when it hasn't even began. Add logic to the cluster version update `onEvent` method to wait up to 5 minutes until the cluster status is no longer `ACTIVE`, so that the subsequent `isComplete` query will be based on the version update operation itself. Extended the timeout of `onEvent` to 15m to ensure it does not interrupt the operation. TESTING: Updated unit tests to verify this retry behavior and performed a manual upgrade tests while examining the logs. Fixes #7457
1 parent ac3b330 commit 97fe9d1

File tree

4 files changed

+53
-2
lines changed

4 files changed

+53
-2
lines changed

packages/@aws-cdk/aws-eks/lib/cluster-resource-handler/cluster.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,36 @@ export class ClusterResourceHandler extends ResourceHandler {
159159
}
160160

161161
await this.eks.updateClusterVersion({ name: this.clusterName, version: newVersion });
162+
163+
// it take a while for the cluster to start the version update, and until
164+
// then the cluster's status is still "ACTIVE", which causes version
165+
// upgrades to complete prematurely. so, we wait here until the cluster
166+
// status changes status from "ACTIVE" and only then yield execution to the
167+
// async waiter. technically the status is expected to be "UPDATING", but it
168+
// is more robust to just make sure it's not "ACTIVE" before we carry on
169+
170+
// wait a total of 5 minutes for this to happen.
171+
let remainingSec = 5 * 60;
172+
173+
while (remainingSec > 0) {
174+
console.log(`waiting for cluster to transition from ACTIVE status (remaining time: ${remainingSec}s)`);
175+
const resp = await this.eks.describeCluster({ name: this.clusterName });
176+
console.log('describeCluster result:', JSON.stringify(resp, undefined, 2));
177+
178+
const status = resp.cluster?.status;
179+
if (status !== 'ACTIVE') {
180+
console.log(`cluster is now in ${status} state`);
181+
break;
182+
}
183+
184+
// wait 2sec before trying again
185+
await sleep(2000);
186+
remainingSec -= 2;
187+
}
188+
189+
if (remainingSec === 0) {
190+
throw new Error('version update failure: cluster did not transition from ACTIVE status after 5 minutes elapsed');
191+
}
162192
}
163193

164194
private async isActive(): Promise<IsCompleteResponse> {
@@ -227,3 +257,8 @@ function analyzeUpdate(oldProps: Partial<aws.EKS.CreateClusterRequest>, newProps
227257
updateLogging: JSON.stringify(newProps.logging) !== JSON.stringify(oldProps.logging),
228258
};
229259
}
260+
261+
async function sleep(ms: number) {
262+
console.log(`waiting ${ms} milliseconds`);
263+
return new Promise(ok => setTimeout(ok, ms));
264+
}

packages/@aws-cdk/aws-eks/lib/cluster-resource-provider.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ export class ClusterResourceProvider extends NestedStack {
4141
description: 'onEvent handler for EKS cluster resource provider',
4242
runtime: HANDLER_RUNTIME,
4343
handler: 'index.onEvent',
44-
timeout: Duration.minutes(1),
44+
timeout: Duration.minutes(15),
4545
});
4646

4747
const isComplete = new lambda.Function(this, 'IsCompleteHandler', {

packages/@aws-cdk/aws-eks/lib/cluster.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1078,6 +1078,6 @@ export enum MachineImageType {
10781078

10791079
const GPU_INSTANCETYPES = ['p2', 'p3', 'g4'];
10801080

1081-
export function nodeTypeForInstanceType(instanceType: ec2.InstanceType) {
1081+
function nodeTypeForInstanceType(instanceType: ec2.InstanceType) {
10821082
return GPU_INSTANCETYPES.includes(instanceType.toString().substring(0, 2)) ? NodeType.GPU : NodeType.STANDARD;
10831083
}

packages/@aws-cdk/aws-eks/test/test.cluster-resource-provider.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,13 +361,21 @@ export = {
361361
}, {
362362
version: undefined,
363363
}));
364+
365+
let count = 3;
366+
const restore = mocks.client.describeCluster;
367+
mocks.client.describeCluster = async () => ({
368+
cluster: { status: --count === 0 ? 'UPDATING' : 'ACTIVE' },
369+
});
364370
const resp = await handler.onEvent();
371+
mocks.client.describeCluster = restore;
365372
test.equal(resp, undefined);
366373
test.deepEqual(mocks.actualRequest.updateClusterVersionRequest!, {
367374
name: 'physical-resource-id',
368375
version: '12.34',
369376
});
370377
test.equal(mocks.actualRequest.createClusterRequest, undefined);
378+
test.equal(count, 0); // make sure "describeCluster" was called until it returned 'UPDATING'
371379
test.done();
372380
},
373381

@@ -377,13 +385,21 @@ export = {
377385
}, {
378386
version: '1.1',
379387
}));
388+
389+
let count = 3;
390+
const restore = mocks.client.describeCluster;
391+
mocks.client.describeCluster = async () => ({
392+
cluster: { status: --count === 0 ? 'UPDATING' : 'ACTIVE' },
393+
});
380394
const resp = await handler.onEvent();
395+
mocks.client.describeCluster = restore;
381396
test.equal(resp, undefined);
382397
test.deepEqual(mocks.actualRequest.updateClusterVersionRequest!, {
383398
name: 'physical-resource-id',
384399
version: '2.0',
385400
});
386401
test.equal(mocks.actualRequest.createClusterRequest, undefined);
402+
test.equal(count, 0); // make sure "describeCluster" was called until it returned 'UPDATING'
387403
test.done();
388404
},
389405

0 commit comments

Comments
 (0)