From 50ad3e20b70eebbf39eda0898c40f9e587040afb Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Fri, 25 Apr 2025 16:20:54 +0800 Subject: [PATCH 1/3] added retry times and two new retry reasons --- br/pkg/restore/split/client.go | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go index 18bb63b9b59b7..6a1019cd5331a 100644 --- a/br/pkg/restore/split/client.go +++ b/br/pkg/restore/split/client.go @@ -200,15 +200,16 @@ func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) log.Warn("failed to batch scatter regions, rollback to sequentially scatter", logutil.ShortError(err)) c.scatterRegionsSequentially( ctx, newRegions, - // backoff about 6s, or we give up scattering this region. + // backoff about 100s total, or we give up scattering this region. &ExponentialBackoffer{ - Attempts: 7, + Attempts: 50, BaseBackoff: 100 * time.Millisecond, + MaxDelay: 2 * time.Second, }) return nil } return err - }, &ExponentialBackoffer{Attempts: 3, BaseBackoff: 500 * time.Millisecond}) + }, &ExponentialBackoffer{Attempts: 3, BaseBackoff: 500 * time.Millisecond, MaxDelay: 2 * time.Second}) } func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) error { @@ -1008,6 +1009,7 @@ func CheckRegionEpoch(_new, _old *RegionInfo) bool { type ExponentialBackoffer struct { Attempts int BaseBackoff time.Duration + MaxDelay time.Duration } func (b *ExponentialBackoffer) exponentialBackoff() time.Duration { @@ -1017,6 +1019,9 @@ func (b *ExponentialBackoffer) exponentialBackoff() time.Duration { return 0 } b.BaseBackoff *= 2 + if b.MaxDelay > 0 && b.BaseBackoff > b.MaxDelay { + b.BaseBackoff = b.MaxDelay + } return bo } @@ -1029,12 +1034,17 @@ func PdErrorCanRetry(err error) bool { // // (2) shouldn't happen in a recently splitted region. // (1) and (3) might happen, and should be retried. + // + // (4) operator canceled because cannot add an operator to the execute queue [PD:store-limit] + // (5) failed to create scatter region operator [PD:schedule:ErrCreateOperator] grpcErr := status.Convert(err) if grpcErr == nil { return false } return strings.Contains(grpcErr.Message(), "is not fully replicated") || - strings.Contains(grpcErr.Message(), "has no leader") + strings.Contains(grpcErr.Message(), "has no leader") || + strings.Contains(grpcErr.Message(), "cannot add an operator to the execute queue") || + strings.Contains(grpcErr.Message(), "failed to create scatter region operator") } // NextBackoff returns a duration to wait before retrying again. From 61f42d0b1884497ea635bf557ce4b9f339d0716f Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Fri, 25 Apr 2025 16:47:08 +0800 Subject: [PATCH 2/3] add ut --- br/pkg/restore/split/BUILD.bazel | 2 +- br/pkg/restore/split/client_test.go | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/br/pkg/restore/split/BUILD.bazel b/br/pkg/restore/split/BUILD.bazel index 448f67d62a579..17c89399fbacd 100644 --- a/br/pkg/restore/split/BUILD.bazel +++ b/br/pkg/restore/split/BUILD.bazel @@ -58,7 +58,7 @@ go_test( ], embed = [":split"], flaky = True, - shard_count = 18, + shard_count = 19, deps = [ "//br/pkg/errors", "//br/pkg/utils", diff --git a/br/pkg/restore/split/client_test.go b/br/pkg/restore/split/client_test.go index 432424960529b..7579797867300 100644 --- a/br/pkg/restore/split/client_test.go +++ b/br/pkg/restore/split/client_test.go @@ -14,6 +14,8 @@ import ( "github.com/pingcap/tidb/pkg/types" "github.com/pingcap/tidb/pkg/util/codec" "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) func TestBatchSplit(t *testing.T) { @@ -266,3 +268,21 @@ func TestSplitMeetErrorAndRetry(t *testing.T) { _, err = mockClient.SplitKeysAndScatter(ctx, [][]byte{{'d'}}) require.ErrorContains(t, err, "no valid key") } + +func TestPDErrorCanRetry(t *testing.T) { + // non-gRPC error should not retry + err := errors.New("random failure") + require.False(t, PdErrorCanRetry(err)) + + e1 := status.Error(codes.Unknown, "region 42 is not fully replicated") + require.True(t, PdErrorCanRetry(e1)) + + e2 := status.Error(codes.Unknown, "operator canceled because cannot add an operator to the execute queue") + require.True(t, PdErrorCanRetry(e2)) + + e3 := status.Error(codes.Unknown, "unable to create operator, failed to create scatter region operator for region 13813282") + require.True(t, PdErrorCanRetry(e3)) + + e4 := status.Error(codes.Unknown, "should be false") + require.False(t, PdErrorCanRetry(e4)) +} From 5c3e5821546c3d3f1a2d521eb51d020a8052b026 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Fri, 25 Apr 2025 17:26:00 +0800 Subject: [PATCH 3/3] add retry times to 1800 --- br/pkg/restore/split/client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go index 6a1019cd5331a..630da513f523e 100644 --- a/br/pkg/restore/split/client.go +++ b/br/pkg/restore/split/client.go @@ -200,9 +200,9 @@ func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) log.Warn("failed to batch scatter regions, rollback to sequentially scatter", logutil.ShortError(err)) c.scatterRegionsSequentially( ctx, newRegions, - // backoff about 100s total, or we give up scattering this region. + // backoff about 1h total, or we give up scattering this region. &ExponentialBackoffer{ - Attempts: 50, + Attempts: 1800, BaseBackoff: 100 * time.Millisecond, MaxDelay: 2 * time.Second, })