Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion br/pkg/restore/split/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ go_test(
],
embed = [":split"],
flaky = True,
shard_count = 18,
shard_count = 19,
deps = [
"//br/pkg/errors",
"//br/pkg/utils",
Expand Down
18 changes: 14 additions & 4 deletions br/pkg/restore/split/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,15 +200,16 @@ func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo)
log.Warn("failed to batch scatter regions, rollback to sequentially scatter", logutil.ShortError(err))
c.scatterRegionsSequentially(
ctx, newRegions,
// backoff about 6s, or we give up scattering this region.
// backoff about 1h total, or we give up scattering this region.
&ExponentialBackoffer{
Attempts: 7,
Attempts: 1800,
BaseBackoff: 100 * time.Millisecond,
MaxDelay: 2 * time.Second,
})
return nil
}
return err
}, &ExponentialBackoffer{Attempts: 3, BaseBackoff: 500 * time.Millisecond})
}, &ExponentialBackoffer{Attempts: 3, BaseBackoff: 500 * time.Millisecond, MaxDelay: 2 * time.Second})
}

func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) error {
Expand Down Expand Up @@ -1008,6 +1009,7 @@ func CheckRegionEpoch(_new, _old *RegionInfo) bool {
// ExponentialBackoffer holds the state of a retry policy whose delay grows
// exponentially between attempts and may be bounded by an upper limit.
type ExponentialBackoffer struct {
	// Attempts is the retry budget configured by the caller.
	// NOTE(review): how the budget is consumed is not visible here — confirm
	// against the backoff methods.
	Attempts int
	// BaseBackoff is the current delay. exponentialBackoff doubles it on each
	// call, so it is mutable state rather than a fixed configuration value.
	BaseBackoff time.Duration
	// MaxDelay caps BaseBackoff after doubling. A value that is not positive
	// disables the cap, leaving the delay free to keep growing.
	MaxDelay time.Duration
}

func (b *ExponentialBackoffer) exponentialBackoff() time.Duration {
Expand All @@ -1017,6 +1019,9 @@ func (b *ExponentialBackoffer) exponentialBackoff() time.Duration {
return 0
}
b.BaseBackoff *= 2
if b.MaxDelay > 0 && b.BaseBackoff > b.MaxDelay {
b.BaseBackoff = b.MaxDelay
}
return bo
}

Expand All @@ -1029,12 +1034,17 @@ func PdErrorCanRetry(err error) bool {
//
// (2) shouldn't happen in a recently split region.
// (1) and (3) might happen, and should be retried.
//
// (4) operator canceled because cannot add an operator to the execute queue [PD:store-limit]
// (5) failed to create scatter region operator [PD:schedule:ErrCreateOperator]
grpcErr := status.Convert(err)
if grpcErr == nil {
return false
}
return strings.Contains(grpcErr.Message(), "is not fully replicated") ||
strings.Contains(grpcErr.Message(), "has no leader")
strings.Contains(grpcErr.Message(), "has no leader") ||
strings.Contains(grpcErr.Message(), "cannot add an operator to the execute queue") ||
strings.Contains(grpcErr.Message(), "failed to create scatter region operator")
}

// NextBackoff returns a duration to wait before retrying again.
Expand Down
20 changes: 20 additions & 0 deletions br/pkg/restore/split/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/codec"
"github.com/stretchr/testify/require"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

func TestBatchSplit(t *testing.T) {
Expand Down Expand Up @@ -266,3 +268,21 @@ func TestSplitMeetErrorAndRetry(t *testing.T) {
_, err = mockClient.SplitKeysAndScatter(ctx, [][]byte{{'d'}})
require.ErrorContains(t, err, "no valid key")
}

// TestPDErrorCanRetry checks which error messages PdErrorCanRetry classifies
// as retryable.
func TestPDErrorCanRetry(t *testing.T) {
	// A plain error whose message matches no retryable pattern is not retried.
	plainErr := errors.New("random failure")
	require.False(t, PdErrorCanRetry(plainErr))

	// Each case wraps its message in a gRPC status error with codes.Unknown,
	// so the verdict depends only on the message text.
	cases := []struct {
		message   string
		retryable bool
	}{
		{message: "region 42 is not fully replicated", retryable: true},
		{message: "operator canceled because cannot add an operator to the execute queue", retryable: true},
		{message: "unable to create operator, failed to create scatter region operator for region 13813282", retryable: true},
		{message: "should be false", retryable: false},
	}
	for _, tc := range cases {
		grpcErr := status.Error(codes.Unknown, tc.message)
		require.Equal(t, tc.retryable, PdErrorCanRetry(grpcErr), tc.message)
	}
}