pingcap · ti-chi-bot · Jun 23, 2025 · Jun 13, 2025 · Jun 16, 2025 · Jun 16, 2025
diff --git a/DEPS.bzl b/DEPS.bzl
diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go
@@ -48,6 +48,7 @@ var (
 	ErrPDInvalidResponse      = errors.Normalize("PD invalid response", errors.RFCCodeText("BR:PD:ErrPDInvalidResponse"))
 	ErrPDBatchScanRegion      = errors.Normalize("batch scan region", errors.RFCCodeText("BR:PD:ErrPDBatchScanRegion"))
 	ErrPDUnknownScatterResult = errors.Normalize("failed to wait region scattered", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult"))
+	ErrPDNotFullyScatter      = errors.Normalize("pd not fully scattered", errors.RFCCodeText("BR:PD:ErrPDNotFullyScatter"))
 	ErrPDSplitFailed          = errors.Normalize("failed to wait region splitted", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult"))
 
 	ErrBackupChecksumMismatch    = errors.Normalize("backup checksum mismatch", errors.RFCCodeText("BR:Backup:ErrBackupChecksumMismatch"))

diff --git a/br/pkg/restore/split/BUILD.bazel b/br/pkg/restore/split/BUILD.bazel
@@ -65,7 +65,7 @@ go_test(
     ],
     embed = [":split"],
     flaky = True,
-    shard_count = 26,
+    shard_count = 28,
     deps = [
         "//br/pkg/errors",
         "//br/pkg/restore/utils",

diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go
@@ -189,21 +189,38 @@ func (c *pdClient) needScatter(ctx context.Context) bool {
 func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) error {
 	log.Info("scatter regions", zap.Int("regions", len(newRegions)))
 	// the retry is for the temporary network errors during sending request.
-	return utils.WithRetry(ctx, func() error {
-		err := c.tryScatterRegions(ctx, newRegions)
+	err := utils.WithRetry(ctx, func() error {
+		failedRegionsID, err := c.tryScatterRegions(ctx, newRegions)
 		if isUnsupportedError(err) {
 			log.Warn("batch scatter isn't supported, rollback to old method", logutil.ShortError(err))
 			c.scatterRegionsSequentially(
 				ctx, newRegions,
-				// backoff about 6s, or we give up scattering this region.
-				utils.NewBackoffRetryAllErrorStrategy(7, 100*time.Millisecond, 2*time.Second))
+				// backoff about 1h total, or we give up scattering this region.
+				utils.NewBackoffRetryAllErrorStrategy(1800, 100*time.Millisecond, 2*time.Second))
 			return nil
 		}
+		// Narrow newRegions to the regions PD reported as failed, so the next retry only re-sends those.
+		if len(failedRegionsID) > 0 {
+			failedRegions := make([]*RegionInfo, 0, len(failedRegionsID))
+			for _, region := range newRegions {
+				if _, exists := failedRegionsID[region.Region.Id]; exists {
+					failedRegions = append(failedRegions, region)
+				}
+			}
+			newRegions = failedRegions
+			return errors.Annotatef(berrors.ErrPDNotFullyScatter,
+				"pd returns error during batch scattering: %d regions failed to scatter", len(failedRegionsID))
+		}
 		return err
-	}, utils.NewBackoffRetryAllErrorStrategy(3, 500*time.Millisecond, 2*time.Second))
+	}, utils.NewBackoffRetryAllErrorStrategy(1800, 500*time.Millisecond, 2*time.Second))
+	if err != nil && berrors.ErrPDNotFullyScatter.Equal(err) {
+		log.Warn("some regions haven't been scattered", zap.Error(err))
+		return nil
+	}
+	return err
 }
 
-func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) error {
+func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) (map[uint64]struct{}, error) {
 	regionsID := make([]uint64, 0, len(regionInfo))
 	for _, v := range regionInfo {
 		regionsID = append(regionsID, v.Region.Id)
@@ -213,13 +230,19 @@ func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionIn
 	}
 	resp, err := c.client.ScatterRegions(ctx, regionsID, opt.WithSkipStoreLimit())
 	if err != nil {
-		return err
+		return nil, err
 	}
 	if pbErr := resp.GetHeader().GetError(); pbErr.GetType() != pdpb.ErrorType_OK {
-		return errors.Annotatef(berrors.ErrPDInvalidResponse,
+		return nil, errors.Annotatef(berrors.ErrPDInvalidResponse,
 			"pd returns error during batch scattering: %s", pbErr)
 	}
-	return nil
+
+	failedRegionsID := make(map[uint64]struct{})
+	for _, id := range resp.FailedRegionsId {
+		failedRegionsID[id] = struct{}{}
+	}
+
+	return failedRegionsID, nil
 }
 
 func (c *pdClient) GetStore(ctx context.Context, storeID uint64) (*metapb.Store, error) {
@@ -1001,6 +1024,8 @@ func PdErrorCanRetry(err error) bool {
 	// (1) region %d has no leader
 	// (2) region %d is hot
 	// (3) region %d is not fully replicated
+	// (4) operator canceled because cannot add an operator to the execute queue [PD:store-limit]
+	// (5) failed to create scatter region operator [PD:schedule:ErrCreateOperator]
 	//
 	// (2) shouldn't happen in a recently splitted region.
 	// (1) and (3) might happen, and should be retried.
@@ -1009,7 +1034,9 @@ func PdErrorCanRetry(err error) bool {
 		return false
 	}
 	return strings.Contains(grpcErr.Message(), "is not fully replicated") ||
-		strings.Contains(grpcErr.Message(), "has no leader")
+		strings.Contains(grpcErr.Message(), "has no leader") ||
+		strings.Contains(grpcErr.Message(), "cannot add an operator to the execute queue") ||
+		strings.Contains(grpcErr.Message(), "failed to create scatter region operator")
 }
 
 // NextBackoff returns a duration to wait before retrying again.

diff --git a/br/pkg/restore/split/client_test.go b/br/pkg/restore/split/client_test.go
@@ -14,6 +14,8 @@ import (
 	"github.com/pingcap/tidb/pkg/types"
 	"github.com/pingcap/tidb/pkg/util/codec"
 	"github.com/stretchr/testify/require"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 )
 
 func TestBatchSplit(t *testing.T) {
@@ -301,3 +303,21 @@ func TestSplitMeetErrorAndRetry(t *testing.T) {
 	_, err = mockClient.SplitKeysAndScatter(ctx, [][]byte{{'d'}})
 	require.ErrorContains(t, err, "no valid key")
 }
+
+func TestPDErrorCanRetry(t *testing.T) {
+	// non-gRPC error should not retry
+	err := errors.New("random failure")
+	require.False(t, PdErrorCanRetry(err))
+
+	e1 := status.Error(codes.Unknown, "region 42 is not fully replicated")
+	require.True(t, PdErrorCanRetry(e1))
+
+	e2 := status.Error(codes.Unknown, "operator canceled because cannot add an operator to the execute queue")
+	require.True(t, PdErrorCanRetry(e2))
+
+	e3 := status.Error(codes.Unknown, "unable to create operator, failed to create scatter region operator for region 13813282")
+	require.True(t, PdErrorCanRetry(e3))
+
+	e4 := status.Error(codes.Unknown, "should be false")
+	require.False(t, PdErrorCanRetry(e4))
+}
diff --git a/br/pkg/restore/split/mock_pd_client.go b/br/pkg/restore/split/mock_pd_client.go
@@ -194,6 +194,7 @@ type MockPDClientForSplit struct {
 	scatterRegions struct {
 		notImplemented bool
 		regionCount    int
+		failedCount    int
 	}
 	getOperator struct {
 		responses map[uint64][]*pdpb.GetOperatorResponse
@@ -381,6 +382,13 @@ func (c *MockPDClientForSplit) ScatterRegions(_ context.Context, regionIDs []uin
 	if c.scatterRegions.notImplemented {
 		return nil, status.Error(codes.Unimplemented, "Ah, yep")
 	}
+	if c.scatterRegions.failedCount > 0 {
+		c.scatterRegions.failedCount--
+		return &pdpb.ScatterRegionResponse{
+			FinishedPercentage: 0,
+			FailedRegionsId:    regionIDs[:],
+		}, nil
+	}
 	c.scatterRegions.regionCount += len(regionIDs)
 	return &pdpb.ScatterRegionResponse{}, nil
 }

diff --git a/br/pkg/restore/split/split_test.go b/br/pkg/restore/split/split_test.go
@@ -134,6 +134,33 @@ func TestScatterSequentiallyRetryCnt(t *testing.T) {
 	require.Equal(t, 7, backoffer.already)
 }
 
+// TestBatchScatterRegionsRetryCnt checks that scatterRegions returns no error when the mock PD reports every region as failed for its first 7 calls.
+func TestBatchScatterRegionsRetryCnt(t *testing.T) {
+	mockClient := NewMockPDClientForSplit()
+	mockClient.scatterRegions.failedCount = 7
+	client := pdClient{
+		needScatterVal: true,
+		client:         mockClient,
+	}
+	client.needScatterInit.Do(func() {})
+
+	ctx := context.Background()
+	regions := []*RegionInfo{
+		{
+			Region: &metapb.Region{
+				Id: 1,
+			},
+		},
+		{
+			Region: &metapb.Region{
+				Id: 2,
+			},
+		},
+	}
+	err := client.scatterRegions(ctx, regions)
+	require.NoError(t, err)
+}
+
 func TestScatterBackwardCompatibility(t *testing.T) {
 	mockClient := NewMockPDClientForSplit()
 	mockClient.scatterRegions.notImplemented = true

diff --git a/errors.toml b/errors.toml
@@ -196,6 +196,11 @@ error = '''
 PD leader not found
 '''
 
+["BR:PD:ErrPDNotFullyScatter"]
+error = '''
+pd not fully scattered
+'''
+
 ["BR:PD:ErrPDUknownScatterResult"]
 error = '''
 failed to wait region splitted

diff --git a/go.mod b/go.mod
@@ -3,8 +3,8 @@ module github.com/pingcap/tidb
 go 1.23.10
 
 require (
-	cloud.google.com/go/kms v1.15.7
-	cloud.google.com/go/storage v1.38.0
+	cloud.google.com/go/kms v1.15.8
+	cloud.google.com/go/storage v1.39.1
 	github.com/Azure/azure-sdk-for-go/sdk/azcore v1.14.0
 	github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.7.0
 	github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.0.0
@@ -89,7 +89,7 @@ require (
 	github.com/pingcap/errors v0.11.5-0.20250523034308-74f78ae071ee
 	github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86
 	github.com/pingcap/fn v1.0.0
-	github.com/pingcap/kvproto v0.0.0-20250521074834-db74bf0e3ac1
+	github.com/pingcap/kvproto v0.0.0-20250605100108-dc99a8f6e348
 	github.com/pingcap/log v1.1.1-0.20250514022801-14f3b4ca066e
 	github.com/pingcap/sysutil v1.0.1-0.20240311050922-ae81ee01f3a5
 	github.com/pingcap/tidb/pkg/parser v0.0.0-20211011031125-9b13dc409c5e
@@ -121,10 +121,10 @@ require (
 	github.com/xitongsys/parquet-go v1.6.3-0.20240520233950-75e935fc3e17
 	github.com/xitongsys/parquet-go-source v0.0.0-20200817004010-026bad9b25d0
 	github.com/zyedidia/generic v1.2.1
-	go.etcd.io/etcd/api/v3 v3.5.12
-	go.etcd.io/etcd/client/pkg/v3 v3.5.12
-	go.etcd.io/etcd/client/v3 v3.5.12
-	go.etcd.io/etcd/server/v3 v3.5.12
+	go.etcd.io/etcd/api/v3 v3.5.15
+	go.etcd.io/etcd/client/pkg/v3 v3.5.15
+	go.etcd.io/etcd/client/v3 v3.5.15
+	go.etcd.io/etcd/server/v3 v3.5.15
 	go.etcd.io/etcd/tests/v3 v3.5.12
 	go.opencensus.io v0.24.0
 	go.uber.org/atomic v1.11.0
@@ -142,11 +142,11 @@ require (
 	golang.org/x/text v0.25.0
 	golang.org/x/time v0.11.0
 	golang.org/x/tools v0.33.0
-	google.golang.org/api v0.169.0
+	google.golang.org/api v0.170.0
 	google.golang.org/grpc v1.63.2
 	gopkg.in/yaml.v2 v2.4.0
 	gorm.io/driver/mysql v1.5.7
-	gorm.io/gorm v1.25.11
+	gorm.io/gorm v1.25.12
 	honnef.co/go/tools v0.6.1
 	k8s.io/api v0.29.11
 	sourcegraph.com/sourcegraph/appdash v0.0.0-20190731080439-ebfcffb1b5c0
@@ -167,7 +167,7 @@ require (
 	github.com/jinzhu/inflection v1.0.0 // indirect
 	github.com/jinzhu/now v1.1.5 // indirect
 	github.com/klauspost/asmfmt v1.3.2 // indirect
-	github.com/klauspost/cpuid/v2 v2.0.9 // indirect
+	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
 	github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect
 	github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
@@ -182,10 +182,10 @@ require (
 )
 
 require (
-	cloud.google.com/go v0.112.1 // indirect
+	cloud.google.com/go v0.112.2 // indirect
 	cloud.google.com/go/compute/metadata v0.3.0 // indirect
-	cloud.google.com/go/iam v1.1.6 // indirect
-	cloud.google.com/go/pubsub v1.36.1 // indirect
+	cloud.google.com/go/iam v1.1.7 // indirect
+	cloud.google.com/go/pubsub v1.37.0 // indirect
 	github.com/Azure/azure-sdk-for-go/sdk/internal v1.10.0 // indirect
 	github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect
 	github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2 // indirect
@@ -228,7 +228,7 @@ require (
 	github.com/google/renameio/v2 v2.0.0 // indirect
 	github.com/google/s2a-go v0.1.7 // indirect
 	github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
-	github.com/googleapis/gax-go/v2 v2.12.2 // indirect
+	github.com/googleapis/gax-go/v2 v2.12.3 // indirect
 	github.com/gorilla/handlers v1.5.1 // indirect
 	github.com/gorilla/websocket v1.5.1 // indirect
 	github.com/gostaticanalysis/analysisutil v0.7.1 // indirect
@@ -294,10 +294,10 @@ require (
 	github.com/uber/jaeger-lib v2.4.1+incompatible // indirect
 	github.com/xiang90/probing v0.0.0-20221125231312-a49e3df8f510 // indirect
 	github.com/yusufpapurcu/wmi v1.2.4 // indirect
-	go.etcd.io/bbolt v1.3.8 // indirect
-	go.etcd.io/etcd/client/v2 v2.305.12 // indirect
-	go.etcd.io/etcd/pkg/v3 v3.5.12 // indirect
-	go.etcd.io/etcd/raft/v3 v3.5.12 // indirect
+	go.etcd.io/bbolt v1.3.10 // indirect
+	go.etcd.io/etcd/client/v2 v2.305.15 // indirect
+	go.etcd.io/etcd/pkg/v3 v3.5.15 // indirect
+	go.etcd.io/etcd/raft/v3 v3.5.15 // indirect
 	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
 	go.opentelemetry.io/otel v1.24.0 // indirect
@@ -311,12 +311,12 @@ require (
 	golang.org/x/exp/typeparams v0.0.0-20250210185358-939b2ce775ac // indirect
 	golang.org/x/mod v0.24.0 // indirect
 	golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect
-	google.golang.org/genproto v0.0.0-20240227224415-6ceb2ff114de // indirect
-	google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237 // indirect
+	google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20240401170217-c3f982113cda // indirect
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20240515191416-fc5f0ca64291 // indirect
 	google.golang.org/protobuf v1.36.6
 	gopkg.in/inf.v0 v0.9.1 // indirect
-	gopkg.in/natefinch/lumberjack.v2 v2.2.1
+	gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 	k8s.io/apimachinery v0.29.11 // indirect
 	k8s.io/klog/v2 v2.120.1 // indirect