Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,548 changes: 774 additions & 774 deletions DEPS.bzl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions br/pkg/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ var (
ErrPDInvalidResponse = errors.Normalize("PD invalid response", errors.RFCCodeText("BR:PD:ErrPDInvalidResponse"))
ErrPDBatchScanRegion = errors.Normalize("batch scan region", errors.RFCCodeText("BR:PD:ErrPDBatchScanRegion"))
ErrPDUnknownScatterResult = errors.Normalize("failed to wait region scattered", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult"))
ErrPDNotFullyScatter = errors.Normalize("pd not fully scattered", errors.RFCCodeText("BR:PD:ErrPDNotFullyScatter"))
ErrPDSplitFailed = errors.Normalize("failed to wait region splitted", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult"))

ErrBackupChecksumMismatch = errors.Normalize("backup checksum mismatch", errors.RFCCodeText("BR:Backup:ErrBackupChecksumMismatch"))
Expand Down
2 changes: 1 addition & 1 deletion br/pkg/restore/split/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ go_test(
],
embed = [":split"],
flaky = True,
shard_count = 26,
shard_count = 28,
deps = [
"//br/pkg/errors",
"//br/pkg/restore/utils",
Expand Down
47 changes: 37 additions & 10 deletions br/pkg/restore/split/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,21 +189,38 @@ func (c *pdClient) needScatter(ctx context.Context) bool {
func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) error {
log.Info("scatter regions", zap.Int("regions", len(newRegions)))
// the retry is for the temporary network errors during sending request.
return utils.WithRetry(ctx, func() error {
err := c.tryScatterRegions(ctx, newRegions)
err := utils.WithRetry(ctx, func() error {
failedRegionsID, err := c.tryScatterRegions(ctx, newRegions)
if isUnsupportedError(err) {
log.Warn("batch scatter isn't supported, rollback to old method", logutil.ShortError(err))
c.scatterRegionsSequentially(
ctx, newRegions,
// backoff about 6s, or we give up scattering this region.
utils.NewBackoffRetryAllErrorStrategy(7, 100*time.Millisecond, 2*time.Second))
// backoff about 1h total, or we give up scattering this region.
utils.NewBackoffRetryAllErrorStrategy(1800, 100*time.Millisecond, 2*time.Second))
return nil
}
// If there are failed regions, retry them
if len(failedRegionsID) > 0 {
failedRegions := make([]*RegionInfo, 0, len(failedRegionsID))
for _, region := range newRegions {
if _, exists := failedRegionsID[region.Region.Id]; exists {
failedRegions = append(failedRegions, region)
}
}
newRegions = failedRegions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m concerned that the failure-handling logic here may not be robust. If a region is mistakenly judged as needing to be scattered and the operation fails, it appears that this region won’t be retried later.

Consider this scenario: a small number of regions fail to add scatter operators, while others succeed. However, due to the asynchronous nature of operator execution, even regions that successfully had operators added might still time out or encounter other issues afterward. Does this scenario align with the intended design?

return errors.Annotatef(berrors.ErrPDNotFullyScatter,
"pd returns error during batch scattering: %d regions failed to scatter", len(failedRegionsID))
}
return err
}, utils.NewBackoffRetryAllErrorStrategy(3, 500*time.Millisecond, 2*time.Second))
}, utils.NewBackoffRetryAllErrorStrategy(1800, 500*time.Millisecond, 2*time.Second))
if err != nil && berrors.ErrPDNotFullyScatter.Equal(err) {
log.Warn("some regions haven't been scattered", zap.Error(err))
return nil
}
return err
}

func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) error {
func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) (map[uint64]struct{}, error) {
regionsID := make([]uint64, 0, len(regionInfo))
for _, v := range regionInfo {
regionsID = append(regionsID, v.Region.Id)
Expand All @@ -213,13 +230,19 @@ func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionIn
}
resp, err := c.client.ScatterRegions(ctx, regionsID, opt.WithSkipStoreLimit())
if err != nil {
return err
return nil, err
}
if pbErr := resp.GetHeader().GetError(); pbErr.GetType() != pdpb.ErrorType_OK {
return errors.Annotatef(berrors.ErrPDInvalidResponse,
return nil, errors.Annotatef(berrors.ErrPDInvalidResponse,
"pd returns error during batch scattering: %s", pbErr)
}
return nil

failedRegionsID := make(map[uint64]struct{})
for _, id := range resp.FailedRegionsId {
failedRegionsID[id] = struct{}{}
}

return failedRegionsID, nil
}

func (c *pdClient) GetStore(ctx context.Context, storeID uint64) (*metapb.Store, error) {
Expand Down Expand Up @@ -1001,6 +1024,8 @@ func PdErrorCanRetry(err error) bool {
// (1) region %d has no leader
// (2) region %d is hot
// (3) region %d is not fully replicated
// (4) operator canceled because cannot add an operator to the execute queue [PD:store-limit]
// (5) failed to create scatter region operator [PD:schedule:ErrCreateOperator]
//
// (2) shouldn't happen in a recently splitted region.
// (1) and (3) might happen, and should be retried.
Expand All @@ -1009,7 +1034,9 @@ func PdErrorCanRetry(err error) bool {
return false
}
return strings.Contains(grpcErr.Message(), "is not fully replicated") ||
strings.Contains(grpcErr.Message(), "has no leader")
strings.Contains(grpcErr.Message(), "has no leader") ||
strings.Contains(grpcErr.Message(), "cannot add an operator to the execute queue") ||
strings.Contains(grpcErr.Message(), "failed to create scatter region operator")
}

// NextBackoff returns a duration to wait before retrying again.
Expand Down
20 changes: 20 additions & 0 deletions br/pkg/restore/split/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/codec"
"github.com/stretchr/testify/require"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

func TestBatchSplit(t *testing.T) {
Expand Down Expand Up @@ -301,3 +303,21 @@ func TestSplitMeetErrorAndRetry(t *testing.T) {
_, err = mockClient.SplitKeysAndScatter(ctx, [][]byte{{'d'}})
require.ErrorContains(t, err, "no valid key")
}

// TestPDErrorCanRetry verifies which errors PdErrorCanRetry classifies as
// retryable: only gRPC-status errors whose message matches one of the known
// transient PD scatter failures.
func TestPDErrorCanRetry(t *testing.T) {
	cases := []struct {
		name     string
		err      error
		canRetry bool
	}{
		// a plain (non-gRPC) error must never be retried
		{"non-grpc", errors.New("random failure"), false},
		{"not-replicated", status.Error(codes.Unknown, "region 42 is not fully replicated"), true},
		{"operator-queue", status.Error(codes.Unknown, "operator canceled because cannot add an operator to the execute queue"), true},
		{"create-operator", status.Error(codes.Unknown, "unable to create operator, failed to create scatter region operator for region 13813282"), true},
		{"unrelated-grpc", status.Error(codes.Unknown, "should be false"), false},
	}
	for _, tc := range cases {
		require.Equal(t, tc.canRetry, PdErrorCanRetry(tc.err), tc.name)
	}
}
8 changes: 8 additions & 0 deletions br/pkg/restore/split/mock_pd_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ type MockPDClientForSplit struct {
scatterRegions struct {
notImplemented bool
regionCount int
failedCount int
}
getOperator struct {
responses map[uint64][]*pdpb.GetOperatorResponse
Expand Down Expand Up @@ -381,6 +382,13 @@ func (c *MockPDClientForSplit) ScatterRegions(_ context.Context, regionIDs []uin
if c.scatterRegions.notImplemented {
return nil, status.Error(codes.Unimplemented, "Ah, yep")
}
if c.scatterRegions.failedCount > 0 {
c.scatterRegions.failedCount--
return &pdpb.ScatterRegionResponse{
FinishedPercentage: 0,
FailedRegionsId: regionIDs[:],
}, nil
}
c.scatterRegions.regionCount += len(regionIDs)
return &pdpb.ScatterRegionResponse{}, nil
}
Expand Down
27 changes: 27 additions & 0 deletions br/pkg/restore/split/split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,33 @@ func TestScatterSequentiallyRetryCnt(t *testing.T) {
require.Equal(t, 7, backoffer.already)
}

// TestBatchScatterRegionsRetryCnt checks that scatterRegions keeps retrying a
// batch scatter while the mock PD keeps reporting failed regions, and finally
// succeeds once the mock stops failing.
func TestBatchScatterRegionsRetryCnt(t *testing.T) {
	mockClient := NewMockPDClientForSplit()
	// the first 7 ScatterRegions calls report every region as failed
	mockClient.scatterRegions.failedCount = 7
	client := pdClient{
		needScatterVal: true,
		client:         mockClient,
	}
	client.needScatterInit.Do(func() {})

	// build two minimal regions (ids 1 and 2) to scatter
	regions := make([]*RegionInfo, 0, 2)
	for _, id := range []uint64{1, 2} {
		regions = append(regions, &RegionInfo{
			Region: &metapb.Region{
				Id: id,
			},
		})
	}
	require.NoError(t, client.scatterRegions(context.Background(), regions))
}

func TestScatterBackwardCompatibility(t *testing.T) {
mockClient := NewMockPDClientForSplit()
mockClient.scatterRegions.notImplemented = true
Expand Down
5 changes: 5 additions & 0 deletions errors.toml
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,11 @@ error = '''
PD leader not found
'''

["BR:PD:ErrPDNotFullyScatter"]
error = '''
pd not fully scattered
'''

["BR:PD:ErrPDUknownScatterResult"]
error = '''
failed to wait region splitted
Expand Down
42 changes: 21 additions & 21 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ module github.com/pingcap/tidb
go 1.23.10

require (
cloud.google.com/go/kms v1.15.7
cloud.google.com/go/storage v1.38.0
cloud.google.com/go/kms v1.15.8
cloud.google.com/go/storage v1.39.1
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.14.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.7.0
github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.0.0
Expand Down Expand Up @@ -89,7 +89,7 @@ require (
github.com/pingcap/errors v0.11.5-0.20250523034308-74f78ae071ee
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86
github.com/pingcap/fn v1.0.0
github.com/pingcap/kvproto v0.0.0-20250521074834-db74bf0e3ac1
github.com/pingcap/kvproto v0.0.0-20250605100108-dc99a8f6e348
github.com/pingcap/log v1.1.1-0.20250514022801-14f3b4ca066e
github.com/pingcap/sysutil v1.0.1-0.20240311050922-ae81ee01f3a5
github.com/pingcap/tidb/pkg/parser v0.0.0-20211011031125-9b13dc409c5e
Expand Down Expand Up @@ -121,10 +121,10 @@ require (
github.com/xitongsys/parquet-go v1.6.3-0.20240520233950-75e935fc3e17
github.com/xitongsys/parquet-go-source v0.0.0-20200817004010-026bad9b25d0
github.com/zyedidia/generic v1.2.1
go.etcd.io/etcd/api/v3 v3.5.12
go.etcd.io/etcd/client/pkg/v3 v3.5.12
go.etcd.io/etcd/client/v3 v3.5.12
go.etcd.io/etcd/server/v3 v3.5.12
go.etcd.io/etcd/api/v3 v3.5.15
go.etcd.io/etcd/client/pkg/v3 v3.5.15
go.etcd.io/etcd/client/v3 v3.5.15
go.etcd.io/etcd/server/v3 v3.5.15
go.etcd.io/etcd/tests/v3 v3.5.12
go.opencensus.io v0.24.0
go.uber.org/atomic v1.11.0
Expand All @@ -142,11 +142,11 @@ require (
golang.org/x/text v0.25.0
golang.org/x/time v0.11.0
golang.org/x/tools v0.33.0
google.golang.org/api v0.169.0
google.golang.org/api v0.170.0
google.golang.org/grpc v1.63.2
gopkg.in/yaml.v2 v2.4.0
gorm.io/driver/mysql v1.5.7
gorm.io/gorm v1.25.11
gorm.io/gorm v1.25.12
honnef.co/go/tools v0.6.1
k8s.io/api v0.29.11
sourcegraph.com/sourcegraph/appdash v0.0.0-20190731080439-ebfcffb1b5c0
Expand All @@ -167,7 +167,7 @@ require (
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
github.com/klauspost/asmfmt v1.3.2 // indirect
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
Expand All @@ -182,10 +182,10 @@ require (
)

require (
cloud.google.com/go v0.112.1 // indirect
cloud.google.com/go v0.112.2 // indirect
cloud.google.com/go/compute/metadata v0.3.0 // indirect
cloud.google.com/go/iam v1.1.6 // indirect
cloud.google.com/go/pubsub v1.36.1 // indirect
cloud.google.com/go/iam v1.1.7 // indirect
cloud.google.com/go/pubsub v1.37.0 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0 // indirect
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2 // indirect
Expand Down Expand Up @@ -228,7 +228,7 @@ require (
github.com/google/renameio/v2 v2.0.0 // indirect
github.com/google/s2a-go v0.1.7 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.3 // indirect
github.com/gorilla/handlers v1.5.1 // indirect
github.com/gorilla/websocket v1.5.1 // indirect
github.com/gostaticanalysis/analysisutil v0.7.1 // indirect
Expand Down Expand Up @@ -294,10 +294,10 @@ require (
github.com/uber/jaeger-lib v2.4.1+incompatible // indirect
github.com/xiang90/probing v0.0.0-20221125231312-a49e3df8f510 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.etcd.io/bbolt v1.3.8 // indirect
go.etcd.io/etcd/client/v2 v2.305.12 // indirect
go.etcd.io/etcd/pkg/v3 v3.5.12 // indirect
go.etcd.io/etcd/raft/v3 v3.5.12 // indirect
go.etcd.io/bbolt v1.3.10 // indirect
go.etcd.io/etcd/client/v2 v2.305.15 // indirect
go.etcd.io/etcd/pkg/v3 v3.5.15 // indirect
go.etcd.io/etcd/raft/v3 v3.5.15 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
go.opentelemetry.io/otel v1.24.0 // indirect
Expand All @@ -311,12 +311,12 @@ require (
golang.org/x/exp/typeparams v0.0.0-20250210185358-939b2ce775ac // indirect
golang.org/x/mod v0.24.0 // indirect
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect
google.golang.org/genproto v0.0.0-20240227224415-6ceb2ff114de // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237 // indirect
google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240401170217-c3f982113cda // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 // indirect
google.golang.org/protobuf v1.36.6
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apimachinery v0.29.11 // indirect
k8s.io/klog/v2 v2.120.1 // indirect
Expand Down
Loading
Loading