Skip to content

Commit

Permalink
r/ecs_service: retry WaitUntilServicesStable up to 3 times
Browse files Browse the repository at this point in the history
  • Loading branch information
anGie44 committed May 4, 2022
1 parent 24faf1f commit 6ea6f18
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 23 deletions.
20 changes: 0 additions & 20 deletions internal/conns/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -400,26 +400,6 @@ func (c *Config) Client(ctx context.Context) (interface{}, diag.Diagnostics) {
}
})

client.ECSConn.Handlers.Retry.PushBack(func(r *request.Request) {
// By design the "WaitUntilServicesStable" method will poll every 15 seconds until a successful state
// has been reached. This will exit with a return code of 255 (ResourceNotReady) after 40 failed checks.
// Thus, here we retry the operation a set number of times as
// described in https://github.com/hashicorp/terraform-provider-aws/pull/23747.
if r.Operation.Name == "WaitUntilServicesStable" {
if tfawserr.ErrCodeEquals(r.Error, "ResourceNotReady") {
// We only want to retry briefly as the default max retry count would
// excessively retry when the error could be legitimate.
// We currently depend on the DefaultRetryer exponential backoff here.
// ~10 retries gives a fair backoff of a few seconds.
if r.RetryCount < 9 {
r.Retryable = aws.Bool(true)
} else {
r.Retryable = aws.Bool(false)
}
}
}
})

client.FMSConn.Handlers.Retry.PushBack(func(r *request.Request) {
// Acceptance testing creates and deletes resources in quick succession.
// The FMS onboarding process into Organizations is opaque to consumers.
Expand Down
25 changes: 22 additions & 3 deletions internal/service/ecs/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ecs

import (
"context"
"log"
"time"

"github.com/aws/aws-sdk-go/aws"
Expand All @@ -26,6 +27,8 @@ const (

taskSetCreateTimeout = 10 * time.Minute
taskSetDeleteTimeout = 10 * time.Minute

serviceStableRetryCount = 3
)

func waitCapacityProviderDeleted(conn *ecs.ECS, arn string) (*ecs.CapacityProvider, error) {
Expand Down Expand Up @@ -63,6 +66,7 @@ func waitCapacityProviderUpdated(conn *ecs.ECS, arn string) (*ecs.CapacityProvid
}

func waitServiceStable(conn *ecs.ECS, id, cluster string) error {
var err error
input := &ecs.DescribeServicesInput{
Services: aws.StringSlice([]string{id}),
}
Expand All @@ -71,10 +75,25 @@ func waitServiceStable(conn *ecs.ECS, id, cluster string) error {
input.Cluster = aws.String(cluster)
}

if err := conn.WaitUntilServicesStable(input); err != nil {
return err
// Here we retry the following operation a set number of times as
// described in https://github.com/hashicorp/terraform-provider-aws/pull/23747.
// Previously, this retry was attempted via a Retry handler on ECSConn.Handlers in conns/config.go, but that did not work as expected.
// Reference: https://github.com/hashicorp/terraform-provider-aws/pull/24223.
// The waiter within the "WaitUntilServicesStable" request will poll until the service is either
// in a failure state ('MISSING', 'DRAINING', or 'INACTIVE') or reaches the successful stable state.
// Reference: https://github.com/aws/aws-sdk-go/blob/f377248cbb3037d1989004ba26f6d73f620461df/service/ecs/waiters.go#L79-L105
// Thus, since the waiter returns an error as soon as one of the 'MISSING', 'DRAINING', or 'INACTIVE' states is observed,
// we call it up to serviceStableRetryCount (3) times, in the hope that the service reaches a stable state on a later attempt.
for i := 1; i <= serviceStableRetryCount; i++ {
log.Printf("[DEBUG] WaitUntilServicesStable attempt %d/%d", i, serviceStableRetryCount)
err = conn.WaitUntilServicesStable(input)
if err == nil {
return nil
}
log.Printf("[DEBUG] error received from WaitUntilServicesStable: %s", err)
}
return nil

return err
}

func waitServiceInactive(conn *ecs.ECS, id, cluster string) error {
Expand Down

0 comments on commit 6ea6f18

Please sign in to comment.