diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go index e7373f67e690e..eda7d39af4e16 100644 --- a/br/pkg/errors/errors.go +++ b/br/pkg/errors/errors.go @@ -48,7 +48,12 @@ var ( ErrPDInvalidResponse = errors.Normalize("PD invalid response", errors.RFCCodeText("BR:PD:ErrPDInvalidResponse")) ErrPDBatchScanRegion = errors.Normalize("batch scan region", errors.RFCCodeText("BR:PD:ErrPDBatchScanRegion")) ErrPDUnknownScatterResult = errors.Normalize("failed to wait region scattered", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) +<<<<<<< HEAD ErrPDSplitFailed = errors.Normalize("failed to wait region splitted", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) +======= + ErrPDNotFullyScatter = errors.Normalize("pd not fully scattered", errors.RFCCodeText("BR:PD:ErrPDNotFullyScatter")) + ErrPDSplitFailed = errors.Normalize("failed to wait region split", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) +>>>>>>> bc50e0d3e23 (region scatter client: retry scatter regions when finished percentage is not 100. (#64884)) ErrPDRegionsNotFullyScatter = errors.Normalize("regions not fully scattered", errors.RFCCodeText("BR:PD:ErrPDRegionsNotFullyScatter")) ErrBackupChecksumMismatch = errors.Normalize("backup checksum mismatch", errors.RFCCodeText("BR:Backup:ErrBackupChecksumMismatch")) diff --git a/br/pkg/restore/split/BUILD.bazel b/br/pkg/restore/split/BUILD.bazel index 4b00047f0d75b..5187f05808cf9 100644 --- a/br/pkg/restore/split/BUILD.bazel +++ b/br/pkg/restore/split/BUILD.bazel @@ -63,7 +63,7 @@ go_test( ], embed = [":split"], flaky = True, - shard_count = 28, + shard_count = 29, deps = [ "//br/pkg/errors", "//br/pkg/restore/utils", diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go index 18161afa71922..c78516f8ce931 100644 --- a/br/pkg/restore/split/client.go +++ b/br/pkg/restore/split/client.go @@ -188,8 +188,13 @@ func (c *pdClient) needScatter(ctx context.Context) bool { func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) error { log.Info("scatter regions", zap.Int("regions", len(newRegions))) // the retry is for the temporary network errors during sending request. +<<<<<<< HEAD return utils.WithRetry(ctx, func() error { err := c.tryScatterRegions(ctx, newRegions) +======= + err := utils.WithRetry(ctx, func() error { + failedRegionsID, err := c.tryScatterRegions(ctx, newRegions) +>>>>>>> bc50e0d3e23 (region scatter client: retry scatter regions when finished percentage is not 100. (#64884)) // if err is unsupported, we need to fallback to the old method. // ErrPDRegionsNotFullyScatter means the regions are not fully scattered, // in new version of PD, the scatter regions API will return the failed regions id, @@ -227,11 +232,28 @@ func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionIn return errors.Annotatef(berrors.ErrPDInvalidResponse, "pd returns error during batch scattering: %s", pbErr) } +<<<<<<< HEAD if finished := resp.GetFinishedPercentage(); finished < 100 { return errors.Annotatef(berrors.ErrPDRegionsNotFullyScatter, "scatter finished percentage %d less than 100", finished) } return nil +======= + + if len(resp.FailedRegionsId) > 0 { + failedRegionsID := make(map[uint64]struct{}) + for _, id := range resp.FailedRegionsId { + failedRegionsID[id] = struct{}{} + } + return failedRegionsID, nil + } + + if finished := resp.GetFinishedPercentage(); finished < 100 { + return nil, errors.Annotatef(berrors.ErrPDRegionsNotFullyScatter, "scatter finished percentage %d less than 100", finished) + } + + return nil, nil +>>>>>>> bc50e0d3e23 (region scatter client: retry scatter regions when finished percentage is not 100. (#64884)) } func (c *pdClient) GetStore(ctx context.Context, storeID uint64) (*metapb.Store, error) { diff --git a/br/pkg/restore/split/mock_pd_client.go b/br/pkg/restore/split/mock_pd_client.go index 3507314202aa3..e0de99b7c198b 100644 --- a/br/pkg/restore/split/mock_pd_client.go +++ b/br/pkg/restore/split/mock_pd_client.go @@ -190,6 +190,10 @@ type MockPDClientForSplit struct { scatterRegions struct { notImplemented bool regionCount int +<<<<<<< HEAD +======= + failedCount int +>>>>>>> bc50e0d3e23 (region scatter client: retry scatter regions when finished percentage is not 100. (#64884)) finishedPercentage int } getOperator struct { @@ -379,6 +383,16 @@ func (c *MockPDClientForSplit) ScatterRegions(_ context.Context, regionIDs []uin if c.scatterRegions.notImplemented { return nil, status.Error(codes.Unimplemented, "Ah, yep") } +<<<<<<< HEAD +======= + if c.scatterRegions.failedCount > 0 { + c.scatterRegions.failedCount-- + return &pdpb.ScatterRegionResponse{ + FinishedPercentage: 0, + FailedRegionsId: regionIDs[:], + }, nil + } +>>>>>>> bc50e0d3e23 (region scatter client: retry scatter regions when finished percentage is not 100. (#64884)) c.scatterRegions.regionCount += len(regionIDs) * c.scatterRegions.finishedPercentage / 100 return &pdpb.ScatterRegionResponse{FinishedPercentage: uint64(c.scatterRegions.finishedPercentage)}, nil } diff --git a/br/pkg/restore/split/split_test.go b/br/pkg/restore/split/split_test.go index b936a58bec0e5..7f508aed84d71 100644 --- a/br/pkg/restore/split/split_test.go +++ b/br/pkg/restore/split/split_test.go @@ -1075,3 +1075,44 @@ func TestSplitPoint2(t *testing.T) { }) require.NoError(t, err) } + +func TestRegionsNotFullyScatter(t *testing.T) { + mockClient := NewMockPDClientForSplit() + client := pdClient{ + needScatterVal: true, + client: mockClient, + } + client.needScatterInit.Do(func() {}) + ctx := context.Background() + + regions := []*RegionInfo{ + { + Region: &metapb.Region{ + Id: 1, + }, + }, + { + Region: &metapb.Region{ + Id: 2, + }, + }, + } + err := client.scatterRegions(ctx, regions) + require.NoError(t, err) + require.Equal(t, 2, mockClient.scatterRegions.regionCount) + require.Len(t, mockClient.scatterRegion.count, 0) + + // simulate that one region is not fully scattered when scatterRegions + mockClient.scatterRegions.finishedPercentage = 50 + err = client.scatterRegions(ctx, regions) + require.NoError(t, err) + require.Equal(t, 2+1, mockClient.scatterRegions.regionCount) + require.Equal(t, map[uint64]int{1: 1, 2: 1}, mockClient.scatterRegion.count) + + // simulate that the regions is not fully scattered when scatterRegion + mockClient.scatterRegion.eachRegionFailBefore = 7 + err = client.scatterRegions(ctx, regions) + require.NoError(t, err) + require.Equal(t, 2+1+1, mockClient.scatterRegions.regionCount) + require.Equal(t, map[uint64]int{1: 1 + 7, 2: 1 + 7}, mockClient.scatterRegion.count) +} diff --git a/errors.toml b/errors.toml index ec2666a7477d9..f43e49510027b 100644 --- a/errors.toml +++ b/errors.toml @@ -201,6 +201,11 @@ error = ''' regions not fully scattered ''' +["BR:PD:ErrPDRegionsNotFullyScatter"] +error = ''' +regions not fully scattered +''' + ["BR:PD:ErrPDUknownScatterResult"] error = ''' failed to wait region splitted