Skip to content

Commit

Permalink
fix(storage): retry gRPC DEADLINE_EXCEEDED errors (#10635)
Browse files Browse the repository at this point in the history
* fix(storage): retry gRPC DEADLINE_EXCEEDED errors

GCS is in some cases returning these for internal timeouts, so they
need to be retried. I added an extra context error check to our
retry logic to ensure that they can always be distinguished from
user-set deadlines as well as some tests.

See internal bug b/

* add comment explaining error
  • Loading branch information
tritone authored Aug 6, 2024
1 parent baa1c40 commit 0018415
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 3 deletions.
38 changes: 38 additions & 0 deletions storage/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/googleapis/gax-go/v2/callctx"
"google.golang.org/api/iterator"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

var emulatorClients map[string]storageClient
Expand Down Expand Up @@ -1415,6 +1416,43 @@ func TestRetryMaxAttemptsEmulated(t *testing.T) {
})
}

// Test that a timeout returns a DeadlineExceeded error, in spite of DeadlineExceeded being a retryable
// status when it is returned by the server.
func TestTimeoutErrorEmulated(t *testing.T) {
transportClientTest(t, func(t *testing.T, project, bucket string, client storageClient) {
ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond)
defer cancel()
time.Sleep(5 * time.Nanosecond)
config := &retryConfig{backoff: &gax.Backoff{Initial: 10 * time.Millisecond}}
_, err := client.GetBucket(ctx, bucket, nil, idempotent(true), withRetryConfig(config))

// Error may come through as a context.DeadlineExceeded (HTTP) or status.DeadlineExceeded (gRPC)
if !(errors.Is(err, context.DeadlineExceeded) || status.Code(err) == codes.DeadlineExceeded) {
t.Errorf("GetBucket: got unexpected error %v; want DeadlineExceeded", err)
}

// Validate that error was not retried. If it was retried, that will be mentioned
// in the error string because of wrapping.
if strings.Contains(err.Error(), "retry") {
t.Errorf("GetBucket: got error %v, expected non-retried error", err)
}
})
}

// Test that server-side DEADLINE_EXCEEDED errors are retried as expected with gRPC.
func TestRetryDeadlineExceedeEmulated(t *testing.T) {
transportClientTest(t, func(t *testing.T, project, bucket string, client storageClient) {
ctx := context.Background()
instructions := map[string][]string{"storage.buckets.get": {"return-504", "return-504"}}
testID := createRetryTest(t, project, bucket, client, instructions)
ctx = callctx.SetHeaders(ctx, "x-retry-test-id", testID)
config := &retryConfig{maxAttempts: expectedAttempts(4), backoff: &gax.Backoff{Initial: 10 * time.Millisecond}}
if _, err := client.GetBucket(ctx, bucket, nil, idempotent(true), withRetryConfig(config)); err != nil {
t.Fatalf("GetBucket: got unexpected error %v, want nil", err)
}
})
}

// createRetryTest creates a bucket in the emulator and sets up a test using the
// Retry Test API for the given instructions. This is intended for emulator tests
// of retry behavior that are not covered by conformance tests.
Expand Down
14 changes: 11 additions & 3 deletions storage/invoke.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,15 @@ func run(ctx context.Context, call func(ctx context.Context) error, retry *retry
return true, fmt.Errorf("storage: retry failed after %v attempts; last error: %w", *retry.maxAttempts, err)
}
attempts++
return !errorFunc(err), err
retryable := errorFunc(err)
// Explicitly check context cancellation so that we can distinguish between a
// DEADLINE_EXCEEDED error from the server and a user-set context deadline.
// Unfortunately gRPC will codes.DeadlineExceeded (which may be retryable if it's
// sent by the server) in both cases.
if ctxErr := ctx.Err(); errors.Is(ctxErr, context.Canceled) || errors.Is(ctxErr, context.DeadlineExceeded) {
retryable = false
}
return !retryable, err
})
}

Expand Down Expand Up @@ -129,9 +137,9 @@ func ShouldRetry(err error) bool {
return true
}
}
// UNAVAILABLE, RESOURCE_EXHAUSTED, and INTERNAL codes are all retryable for gRPC.
// UNAVAILABLE, RESOURCE_EXHAUSTED, INTERNAL, and DEADLINE_EXCEEDED codes are all retryable for gRPC.
if st, ok := status.FromError(err); ok {
if code := st.Code(); code == codes.Unavailable || code == codes.ResourceExhausted || code == codes.Internal {
if code := st.Code(); code == codes.Unavailable || code == codes.ResourceExhausted || code == codes.Internal || code == codes.DeadlineExceeded {
return true
}
}
Expand Down

0 comments on commit 0018415

Please sign in to comment.