From 8d595de8427956f834571305e8da436efce9f295 Mon Sep 17 00:00:00 2001 From: GMHDBJD <35025882+GMHDBJD@users.noreply.github.com> Date: Wed, 2 Apr 2025 18:10:18 +0800 Subject: [PATCH 01/16] importinto: retry scatter regions when finished percentage is not 100 (#60315) ref pingcap/tidb#60077 --- br/pkg/errors/errors.go | 13 ++++---- br/pkg/restore/split/BUILD.bazel | 2 +- br/pkg/restore/split/client.go | 13 ++++++-- br/pkg/restore/split/mock_pd_client.go | 10 ++++--- br/pkg/restore/split/split_test.go | 41 ++++++++++++++++++++++++++ errors.toml | 5 ++++ 6 files changed, 71 insertions(+), 13 deletions(-) diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go index 3bd2ab776ccb3..ce9793d385ac4 100644 --- a/br/pkg/errors/errors.go +++ b/br/pkg/errors/errors.go @@ -41,12 +41,13 @@ var ( ErrUnsupportedOperation = errors.Normalize("the operation is not supported", errors.RFCCodeText("BR:Common:ErrUnsupportedOperation")) ErrInvalidRange = errors.Normalize("invalid restore range", errors.RFCCodeText("BR:Common:ErrInvalidRange")) - ErrPDUpdateFailed = errors.Normalize("failed to update PD", errors.RFCCodeText("BR:PD:ErrPDUpdateFailed")) - ErrPDLeaderNotFound = errors.Normalize("PD leader not found", errors.RFCCodeText("BR:PD:ErrPDLeaderNotFound")) - ErrPDInvalidResponse = errors.Normalize("PD invalid response", errors.RFCCodeText("BR:PD:ErrPDInvalidResponse")) - ErrPDBatchScanRegion = errors.Normalize("batch scan region", errors.RFCCodeText("BR:PD:ErrPDBatchScanRegion")) - ErrPDUnknownScatterResult = errors.Normalize("failed to wait region scattered", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) - ErrPDSplitFailed = errors.Normalize("failed to wait region splitted", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) + ErrPDUpdateFailed = errors.Normalize("failed to update PD", errors.RFCCodeText("BR:PD:ErrPDUpdateFailed")) + ErrPDLeaderNotFound = errors.Normalize("PD leader not found", errors.RFCCodeText("BR:PD:ErrPDLeaderNotFound")) + ErrPDInvalidResponse = errors.Normalize("PD invalid response", errors.RFCCodeText("BR:PD:ErrPDInvalidResponse")) + ErrPDBatchScanRegion = errors.Normalize("batch scan region", errors.RFCCodeText("BR:PD:ErrPDBatchScanRegion")) + ErrPDUnknownScatterResult = errors.Normalize("failed to wait region scattered", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) + ErrPDSplitFailed = errors.Normalize("failed to wait region splitted", errors.RFCCodeText("BR:PD:ErrPDUknownScatterResult")) + ErrPDRegionsNotFullyScatter = errors.Normalize("regions not fully scattered", errors.RFCCodeText("BR:PD:ErrPDRegionsNotFullyScatter")) ErrBackupChecksumMismatch = errors.Normalize("backup checksum mismatch", errors.RFCCodeText("BR:Backup:ErrBackupChecksumMismatch")) ErrBackupInvalidRange = errors.Normalize("backup range invalid", errors.RFCCodeText("BR:Backup:ErrBackupInvalidRange")) diff --git a/br/pkg/restore/split/BUILD.bazel b/br/pkg/restore/split/BUILD.bazel index 2d7002b493ad2..5cb9204bb891f 100644 --- a/br/pkg/restore/split/BUILD.bazel +++ b/br/pkg/restore/split/BUILD.bazel @@ -55,7 +55,7 @@ go_test( ], embed = [":split"], flaky = True, - shard_count = 19, + shard_count = 20, deps = [ "//br/pkg/errors", "//br/pkg/utils", diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go index fd3af4ee16fae..40542fc27d1c2 100644 --- a/br/pkg/restore/split/client.go +++ b/br/pkg/restore/split/client.go @@ -190,8 +190,13 @@ func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) // the retry is for the temporary network errors during sending request. return utils.WithRetry(ctx, func() error { err := c.tryScatterRegions(ctx, newRegions) - if isUnsupportedError(err) { - log.Warn("batch scatter isn't supported, rollback to old method", logutil.ShortError(err)) + // if err is unsupported, we need to fallback to the old method. + // ErrPDRegionsNotFullyScatter means the regions are not fully scattered, + // in new version of PD, the scatter regions API will return the failed regions id, + // but the old version of PD will only return the FinishedPercentage. + // so we need to retry the regions one by one. + if isUnsupportedError(err) || berrors.ErrPDRegionsNotFullyScatter.Equal(err) { + log.Warn("failed to batch scatter regions, rollback to sequentially scatter", logutil.ShortError(err)) c.scatterRegionsSequentially( ctx, newRegions, // backoff about 6s, or we give up scattering this region. @@ -221,6 +226,10 @@ func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionIn return errors.Annotatef(berrors.ErrPDInvalidResponse, "pd returns error during batch scattering: %s", pbErr) } + if finished := resp.GetFinishedPercentage(); finished < 100 { + return errors.Annotatef(berrors.ErrPDRegionsNotFullyScatter, "scatter finished percentage %d less than 100", finished) + } + return nil } diff --git a/br/pkg/restore/split/mock_pd_client.go b/br/pkg/restore/split/mock_pd_client.go index 92cd055939926..bcb4134747dff 100644 --- a/br/pkg/restore/split/mock_pd_client.go +++ b/br/pkg/restore/split/mock_pd_client.go @@ -39,8 +39,9 @@ type MockPDClientForSplit struct { count map[uint64]int } scatterRegions struct { - notImplemented bool - regionCount int + notImplemented bool + regionCount int + finishedPercentage int } getOperator struct { responses map[uint64][]*pdpb.GetOperatorResponse @@ -52,6 +53,7 @@ func NewMockPDClientForSplit() *MockPDClientForSplit { ret := &MockPDClientForSplit{} ret.Regions = &pdtypes.RegionTree{} ret.scatterRegion.count = make(map[uint64]int) + ret.scatterRegions.finishedPercentage = 100 return ret } @@ -221,8 +223,8 @@ func (c *MockPDClientForSplit) ScatterRegions(_ context.Context, regionIDs []uin if c.scatterRegions.notImplemented { return nil, status.Error(codes.Unimplemented, "Ah, yep") } - c.scatterRegions.regionCount += len(regionIDs) - return &pdpb.ScatterRegionResponse{}, nil + c.scatterRegions.regionCount += len(regionIDs) * c.scatterRegions.finishedPercentage / 100 + return &pdpb.ScatterRegionResponse{FinishedPercentage: uint64(c.scatterRegions.finishedPercentage)}, nil } func (c *MockPDClientForSplit) GetOperator(_ context.Context, regionID uint64) (*pdpb.GetOperatorResponse, error) { diff --git a/br/pkg/restore/split/split_test.go b/br/pkg/restore/split/split_test.go index 2250f7a96635c..b1704adafb02f 100644 --- a/br/pkg/restore/split/split_test.go +++ b/br/pkg/restore/split/split_test.go @@ -756,3 +756,44 @@ func TestScanRegionsWithRetry(t *testing.T) { require.Equal(t, []byte("2"), regions[1].Region.StartKey) } } + +func TestRegionsNotFullyScatter(t *testing.T) { + mockClient := NewMockPDClientForSplit() + client := pdClient{ + needScatterVal: true, + client: mockClient, + } + client.needScatterInit.Do(func() {}) + ctx := context.Background() + + regions := []*RegionInfo{ + { + Region: &metapb.Region{ + Id: 1, + }, + }, + { + Region: &metapb.Region{ + Id: 2, + }, + }, + } + err := client.scatterRegions(ctx, regions) + require.NoError(t, err) + require.Equal(t, 2, mockClient.scatterRegions.regionCount) + require.Len(t, mockClient.scatterRegion.count, 0) + + // simulate that one region is not fully scattered when scatterRegions + mockClient.scatterRegions.finishedPercentage = 50 + err = client.scatterRegions(ctx, regions) + require.NoError(t, err) + require.Equal(t, 2+1, mockClient.scatterRegions.regionCount) + require.Equal(t, map[uint64]int{1: 1, 2: 1}, mockClient.scatterRegion.count) + + // simulate that the regions is not fully scattered when scatterRegion + mockClient.scatterRegion.eachRegionFailBefore = 7 + err = client.scatterRegions(ctx, regions) + require.NoError(t, err) + require.Equal(t, 2+1+1, mockClient.scatterRegions.regionCount) + require.Equal(t, map[uint64]int{1: 1 + 7, 2: 1 + 7}, mockClient.scatterRegion.count) +} diff --git a/errors.toml b/errors.toml index 1f9329951dbdf..a2942dddb2e77 100644 --- a/errors.toml +++ b/errors.toml @@ -186,6 +186,11 @@ error = ''' PD leader not found ''' +["BR:PD:ErrPDRegionsNotFullyScatter"] +error = ''' +regions not fully scattered +''' + ["BR:PD:ErrPDUknownScatterResult"] error = ''' failed to wait region splitted From 69e076545eb1ce2d7bf5c0f9023835c745245a9a Mon Sep 17 00:00:00 2001 From: GMHDBJD <35025882+GMHDBJD@users.noreply.github.com> Date: Sun, 27 Apr 2025 09:58:18 +0800 Subject: [PATCH 02/16] lightning: added retry times and two new retry reasons (#60839) ref pingcap/tidb#60077 --- br/pkg/restore/split/BUILD.bazel | 2 +- br/pkg/restore/split/client.go | 18 ++++++++++++++---- br/pkg/restore/split/client_test.go | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/br/pkg/restore/split/BUILD.bazel b/br/pkg/restore/split/BUILD.bazel index 5cb9204bb891f..30e6ce25d51e8 100644 --- a/br/pkg/restore/split/BUILD.bazel +++ b/br/pkg/restore/split/BUILD.bazel @@ -55,7 +55,7 @@ go_test( ], embed = [":split"], flaky = True, - shard_count = 20, + shard_count = 21, deps = [ "//br/pkg/errors", "//br/pkg/utils", diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go index 40542fc27d1c2..8da0d18b00887 100644 --- a/br/pkg/restore/split/client.go +++ b/br/pkg/restore/split/client.go @@ -199,15 +199,16 @@ func (c *pdClient) scatterRegions(ctx context.Context, newRegions []*RegionInfo) log.Warn("failed to batch scatter regions, rollback to sequentially scatter", logutil.ShortError(err)) c.scatterRegionsSequentially( ctx, newRegions, - // backoff about 6s, or we give up scattering this region. + // backoff about 1h total, or we give up scattering this region. &ExponentialBackoffer{ - Attempts: 7, + Attempts: 1800, BaseBackoff: 100 * time.Millisecond, + MaxDelay: 2 * time.Second, }) return nil } return err - }, &ExponentialBackoffer{Attempts: 3, BaseBackoff: 500 * time.Millisecond}) + }, &ExponentialBackoffer{Attempts: 3, BaseBackoff: 500 * time.Millisecond, MaxDelay: 2 * time.Second}) } func (c *pdClient) tryScatterRegions(ctx context.Context, regionInfo []*RegionInfo) error { @@ -1012,6 +1013,7 @@ func CheckRegionEpoch(_new, _old *RegionInfo) bool { type ExponentialBackoffer struct { Attempts int BaseBackoff time.Duration + MaxDelay time.Duration } func (b *ExponentialBackoffer) exponentialBackoff() time.Duration { @@ -1021,6 +1023,9 @@ func (b *ExponentialBackoffer) exponentialBackoff() time.Duration { return 0 } b.BaseBackoff *= 2 + if b.MaxDelay > 0 && b.BaseBackoff > b.MaxDelay { + b.BaseBackoff = b.MaxDelay + } return bo } @@ -1033,12 +1038,17 @@ func PdErrorCanRetry(err error) bool { // // (2) shouldn't happen in a recently splitted region. // (1) and (3) might happen, and should be retried. + // + // (4) operator canceled because cannot add an operator to the execute queue [PD:store-limit] + // (5) failed to create scatter region operator [PD:schedule:ErrCreateOperator] grpcErr := status.Convert(err) if grpcErr == nil { return false } return strings.Contains(grpcErr.Message(), "is not fully replicated") || - strings.Contains(grpcErr.Message(), "has no leader") + strings.Contains(grpcErr.Message(), "has no leader") || + strings.Contains(grpcErr.Message(), "cannot add an operator to the execute queue") || + strings.Contains(grpcErr.Message(), "failed to create scatter region operator") } // NextBackoff returns a duration to wait before retrying again. diff --git a/br/pkg/restore/split/client_test.go b/br/pkg/restore/split/client_test.go index e881078777224..e9e681bf02e51 100644 --- a/br/pkg/restore/split/client_test.go +++ b/br/pkg/restore/split/client_test.go @@ -14,6 +14,8 @@ import ( "github.com/pingcap/tidb/pkg/types" "github.com/pingcap/tidb/pkg/util/codec" "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) func TestBatchSplit(t *testing.T) { @@ -301,3 +303,21 @@ func TestSplitMeetErrorAndRetry(t *testing.T) { _, err = mockClient.SplitKeysAndScatter(ctx, [][]byte{{'d'}}) require.ErrorContains(t, err, "no valid key") } + +func TestPDErrorCanRetry(t *testing.T) { + // non-gRPC error should not retry + err := errors.New("random failure") + require.False(t, PdErrorCanRetry(err)) + + e1 := status.Error(codes.Unknown, "region 42 is not fully replicated") + require.True(t, PdErrorCanRetry(e1)) + + e2 := status.Error(codes.Unknown, "operator canceled because cannot add an operator to the execute queue") + require.True(t, PdErrorCanRetry(e2)) + + e3 := status.Error(codes.Unknown, "unable to create operator, failed to create scatter region operator for region 13813282") + require.True(t, PdErrorCanRetry(e3)) + + e4 := status.Error(codes.Unknown, "should be false") + require.False(t, PdErrorCanRetry(e4)) +} From 81a194ca239d54b360f63ef7c67c1f76ce5652be Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 14 Jul 2025 16:27:06 +0800 Subject: [PATCH 03/16] fix br test --- br/tests/br_replica_read/run.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index ba6427b3683e1..ac66bc9a89452 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -29,6 +29,9 @@ random_store_id=$(run_pd_ctl -u https://$PD_ADDR store | jq 'first(.stores[]|sel echo "random store id: $random_store_id" run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' 'read_only' +run_pd_ctl -u https://$PD_ADDR config set replication.max-replicas 2 +trap 'run_pd_ctl -u https://$PD_ADDR config set replication.max-replicas 3' EXIT + # set placement rule to add a learner replica for each region in the read only store run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle load --out=$TEST_DIR/default_rules.json cat $CUR/placement_rule_with_learner_template.json | jq ".[].rules[0].count = $VOTER_COUNT" > $TEST_DIR/placement_rule_with_learner.json From 8f619b4815bd63ecae39a48538cf18b17efabc47 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 14 Jul 2025 17:07:51 +0800 Subject: [PATCH 04/16] Revert "fix br test" This reverts commit 81a194ca239d54b360f63ef7c67c1f76ce5652be. --- br/tests/br_replica_read/run.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index ac66bc9a89452..ba6427b3683e1 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -29,9 +29,6 @@ random_store_id=$(run_pd_ctl -u https://$PD_ADDR store | jq 'first(.stores[]|sel echo "random store id: $random_store_id" run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' 'read_only' -run_pd_ctl -u https://$PD_ADDR config set replication.max-replicas 2 -trap 'run_pd_ctl -u https://$PD_ADDR config set replication.max-replicas 3' EXIT - # set placement rule to add a learner replica for each region in the read only store run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle load --out=$TEST_DIR/default_rules.json cat $CUR/placement_rule_with_learner_template.json | jq ".[].rules[0].count = $VOTER_COUNT" > $TEST_DIR/placement_rule_with_learner.json From ccf5f56c90203914e2271a707581eb23eea6612c Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 14 Jul 2025 17:08:45 +0800 Subject: [PATCH 05/16] fix br test --- br/tests/br_replica_read/run.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index ba6427b3683e1..06896bb9b2c2c 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -60,6 +60,9 @@ run_br -u https://$PD_ADDR backup db --db "$DB" -s "local://$TEST_DIR/$DB" --rep run_sql "DROP DATABASE $DB;" +run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' '' +run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle save --in $TEST_DIR/default_rules.json + # restore db echo "restore start..." run_br restore db --db $DB -s "local://$TEST_DIR/$DB" -u https://$PD_ADDR @@ -85,6 +88,4 @@ echo "testing DDL query..." run_curl https://$TIDB_STATUS_ADDR/ddl/history | grep -E '/\*from\(br\)\*/CREATE TABLE' run_curl https://$TIDB_STATUS_ADDR/ddl/history | grep -E '/\*from\(br\)\*/CREATE DATABASE' -run_sql "DROP DATABASE $DB;" -run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' '' -run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle save --in $TEST_DIR/default_rules.json \ No newline at end of file +run_sql "DROP DATABASE $DB;" \ No newline at end of file From 2f0bd18e47ee3e9d5e97dcd30a156db05c35667a Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 14 Jul 2025 18:33:33 +0800 Subject: [PATCH 06/16] debug --- br/tests/br_replica_read/run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index 06896bb9b2c2c..7cdcd22f2e0d6 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -63,6 +63,8 @@ run_sql "DROP DATABASE $DB;" run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' '' run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle save --in $TEST_DIR/default_rules.json +sleep 60 + # restore db echo "restore start..." run_br restore db --db $DB -s "local://$TEST_DIR/$DB" -u https://$PD_ADDR From 54dbf9f1b74337a578b110060514adcf6354f42a Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 14 Jul 2025 18:38:23 +0800 Subject: [PATCH 07/16] debug --- br/pkg/restore/split/client.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/br/pkg/restore/split/client.go b/br/pkg/restore/split/client.go index 8da0d18b00887..18161afa71922 100644 --- a/br/pkg/restore/split/client.go +++ b/br/pkg/restore/split/client.go @@ -838,6 +838,11 @@ func (c *pdClient) scatterRegionsSequentially(ctx context.Context, newRegions [] logutil.Region(region.Region), ) delete(newRegionSet, region.Region.Id) + } else { + log.Warn("scatter region meet error, will retry", + logutil.ShortError(err), + logutil.Region(region.Region), + ) } errs = multierr.Append(errs, err) } From 9c74ace214f28f441a90b457de80d52440fd60fd Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 15 Jul 2025 11:07:10 +0800 Subject: [PATCH 08/16] debug --- br/tests/br_replica_read/run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index 7cdcd22f2e0d6..ae81602aef63b 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -24,6 +24,8 @@ if [ "$VOTER_COUNT" -lt "1" ];then exit 0 fi +trap 'cat "$TEST_DIR/pd.log"' ERR + # set random store to read only random_store_id=$(run_pd_ctl -u https://$PD_ADDR store | jq 'first(.stores[]|select(.store.labels|(.!= null and any(.key == "engine" and .value=="tiflash"))| not)|.store.id)') echo "random store id: $random_store_id" From 50aab65e808995e72a4dd1dd850d4f7ef8b504c8 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 15 Jul 2025 12:16:39 +0800 Subject: [PATCH 09/16] debug --- br/tests/br_replica_read/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index ae81602aef63b..870a051d8a9b2 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -25,6 +25,7 @@ if [ "$VOTER_COUNT" -lt "1" ];then fi trap 'cat "$TEST_DIR/pd.log"' ERR +tail -f "$TEST_DIR/pd.log" & # set random store to read only random_store_id=$(run_pd_ctl -u https://$PD_ADDR store | jq 'first(.stores[]|select(.store.labels|(.!= null and any(.key == "engine" and .value=="tiflash"))| not)|.store.id)') From d5375f7f88b5eb2343f9f24e00da30f3e3fc63cf Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 15 Jul 2025 13:53:23 +0800 Subject: [PATCH 10/16] revert --- br/tests/br_replica_read/run.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index 870a051d8a9b2..ba6427b3683e1 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -24,9 +24,6 @@ if [ "$VOTER_COUNT" -lt "1" ];then exit 0 fi -trap 'cat "$TEST_DIR/pd.log"' ERR -tail -f "$TEST_DIR/pd.log" & - # set random store to read only random_store_id=$(run_pd_ctl -u https://$PD_ADDR store | jq 'first(.stores[]|select(.store.labels|(.!= null and any(.key == "engine" and .value=="tiflash"))| not)|.store.id)') echo "random store id: $random_store_id" @@ -63,11 +60,6 @@ run_br -u https://$PD_ADDR backup db --db "$DB" -s "local://$TEST_DIR/$DB" --rep run_sql "DROP DATABASE $DB;" -run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' '' -run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle save --in $TEST_DIR/default_rules.json - -sleep 60 - # restore db echo "restore start..." run_br restore db --db $DB -s "local://$TEST_DIR/$DB" -u https://$PD_ADDR @@ -93,4 +85,6 @@ echo "testing DDL query..." run_curl https://$TIDB_STATUS_ADDR/ddl/history | grep -E '/\*from\(br\)\*/CREATE TABLE' run_curl https://$TIDB_STATUS_ADDR/ddl/history | grep -E '/\*from\(br\)\*/CREATE DATABASE' -run_sql "DROP DATABASE $DB;" \ No newline at end of file +run_sql "DROP DATABASE $DB;" +run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' '' +run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle save --in $TEST_DIR/default_rules.json \ No newline at end of file From 6509ad9fe93e49309cb1d3020b784b333bff9554 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 16 Jul 2025 09:53:15 +0800 Subject: [PATCH 11/16] change log level --- br/tests/config/pd.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/br/tests/config/pd.toml b/br/tests/config/pd.toml index 3adbcc89b320a..a00c84f3ddb9a 100644 --- a/br/tests/config/pd.toml +++ b/br/tests/config/pd.toml @@ -5,3 +5,6 @@ enable-placement-rules = true cacert-path = "/tmp/backup_restore_test/certs/ca.pem" cert-path = "/tmp/backup_restore_test/certs/pd.pem" key-path = "/tmp/backup_restore_test/certs/pd.key" + +[log] +level = "debug" \ No newline at end of file From 1247720cde1ed5f8e87c13705e6d8e47a58fa477 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 16 Jul 2025 12:18:34 +0800 Subject: [PATCH 12/16] replace $mode with mode --- br/tests/br_replica_read/run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/br/tests/br_replica_read/run.sh b/br/tests/br_replica_read/run.sh index ba6427b3683e1..66ab28c8547e6 100755 --- a/br/tests/br_replica_read/run.sh +++ b/br/tests/br_replica_read/run.sh @@ -27,7 +27,7 @@ fi # set random store to read only random_store_id=$(run_pd_ctl -u https://$PD_ADDR store | jq 'first(.stores[]|select(.store.labels|(.!= null and any(.key == "engine" and .value=="tiflash"))| not)|.store.id)') echo "random store id: $random_store_id" -run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' 'read_only' +run_pd_ctl -u https://$PD_ADDR store label $random_store_id 'mode' 'read_only' # set placement rule to add a learner replica for each region in the read only store run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle load --out=$TEST_DIR/default_rules.json @@ -56,7 +56,7 @@ run_sql "INSERT INTO $DB.usertable2 VALUES (\"c\", \"d\");" # backup db echo "backup start..." -run_br -u https://$PD_ADDR backup db --db "$DB" -s "local://$TEST_DIR/$DB" --replica-read-label '$mode:read_only' +run_br -u https://$PD_ADDR backup db --db "$DB" -s "local://$TEST_DIR/$DB" --replica-read-label 'mode:read_only' run_sql "DROP DATABASE $DB;" @@ -86,5 +86,5 @@ run_curl https://$TIDB_STATUS_ADDR/ddl/history | grep -E '/\*from\(br\)\*/CREATE run_curl https://$TIDB_STATUS_ADDR/ddl/history | grep -E '/\*from\(br\)\*/CREATE DATABASE' run_sql "DROP DATABASE $DB;" -run_pd_ctl -u https://$PD_ADDR store label $random_store_id '$mode' '' +run_pd_ctl -u https://$PD_ADDR store label $random_store_id 'mode' '' run_pd_ctl -u https://$PD_ADDR config placement-rules rule-bundle save --in $TEST_DIR/default_rules.json \ No newline at end of file From 89f55458b54d3dab3462f719eb211b5be69c0173 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 16 Jul 2025 12:47:01 +0800 Subject: [PATCH 13/16] Revert "change log level" This reverts commit 6509ad9fe93e49309cb1d3020b784b333bff9554. --- br/tests/config/pd.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/br/tests/config/pd.toml b/br/tests/config/pd.toml index a00c84f3ddb9a..3adbcc89b320a 100644 --- a/br/tests/config/pd.toml +++ b/br/tests/config/pd.toml @@ -5,6 +5,3 @@ enable-placement-rules = true cacert-path = "/tmp/backup_restore_test/certs/ca.pem" cert-path = "/tmp/backup_restore_test/certs/pd.pem" key-path = "/tmp/backup_restore_test/certs/pd.key" - -[log] -level = "debug" \ No newline at end of file From cf3a4d1d3c3eedf8d6d51c81174680047f40f4ef Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 16 Jul 2025 09:53:15 +0800 Subject: [PATCH 14/16] change log level and enable BR Log Signed-off-by: Juncen Yu --- br/tests/br_tidb_placement_policy/run.sh | 2 +- br/tests/config/pd.toml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/br/tests/br_tidb_placement_policy/run.sh b/br/tests/br_tidb_placement_policy/run.sh index dbc81d6f60a9d..3ee20460b1071 100644 --- a/br/tests/br_tidb_placement_policy/run.sh +++ b/br/tests/br_tidb_placement_policy/run.sh @@ -125,7 +125,7 @@ run_sql "create table $DB.sbtest(id int primary key, k int not null, c char(120) run_sql "insert into $DB.sbtest values ($i, $i, '$i', '$i');" # backup table and policies -run_br backup full -s "local://$TEST_DIR/${DB}_related" --pd $PD_ADDR +BR_LOG_TO_TERM=1 run_br backup full -s "local://$TEST_DIR/${DB}_related" --pd $PD_ADDR # clear data and policies for restore. run_sql "DROP DATABASE $DB;" diff --git a/br/tests/config/pd.toml b/br/tests/config/pd.toml index 3adbcc89b320a..a00c84f3ddb9a 100644 --- a/br/tests/config/pd.toml +++ b/br/tests/config/pd.toml @@ -5,3 +5,6 @@ enable-placement-rules = true cacert-path = "/tmp/backup_restore_test/certs/ca.pem" cert-path = "/tmp/backup_restore_test/certs/pd.pem" key-path = "/tmp/backup_restore_test/certs/pd.key" + +[log] +level = "debug" \ No newline at end of file From 2b07b9545b03ff01976063b3ebdc1376c9e57dda Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Wed, 16 Jul 2025 14:39:55 +0800 Subject: [PATCH 15/16] use a 6 tikv cluster to get rid of replication not enough Signed-off-by: Juncen Yu --- tests/_utils/run_services | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/_utils/run_services b/tests/_utils/run_services index 8f8a31caf7f96..0d6e111415199 100644 --- a/tests/_utils/run_services +++ b/tests/_utils/run_services @@ -26,7 +26,7 @@ export TIDB_STATUS_ADDR="127.0.0.1:10080" # actual tikv_addr are TIKV_ADDR${i} export TIKV_ADDR="127.0.0.1:2016" export TIKV_STATUS_ADDR="127.0.0.1:2018" -export TIKV_COUNT=3 +export TIKV_COUNT=6 export TIFLASH_HTTP="127.0.0.1:20292" export TIKV_PIDS="${TEST_DIR:?}/tikv_pids.txt" From aa7abd1dda62b5ceeab957a722688a342ba8134b Mon Sep 17 00:00:00 2001 From: Juncen Yu Date: Wed, 16 Jul 2025 15:06:03 +0800 Subject: [PATCH 16/16] Revert "change log level and enable BR Log" This reverts commit cf3a4d1d3c3eedf8d6d51c81174680047f40f4ef. Signed-off-by: Juncen Yu --- br/tests/br_tidb_placement_policy/run.sh | 2 +- br/tests/config/pd.toml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/br/tests/br_tidb_placement_policy/run.sh b/br/tests/br_tidb_placement_policy/run.sh index 3ee20460b1071..dbc81d6f60a9d 100644 --- a/br/tests/br_tidb_placement_policy/run.sh +++ b/br/tests/br_tidb_placement_policy/run.sh @@ -125,7 +125,7 @@ run_sql "create table $DB.sbtest(id int primary key, k int not null, c char(120) run_sql "insert into $DB.sbtest values ($i, $i, '$i', '$i');" # backup table and policies -BR_LOG_TO_TERM=1 run_br backup full -s "local://$TEST_DIR/${DB}_related" --pd $PD_ADDR +run_br backup full -s "local://$TEST_DIR/${DB}_related" --pd $PD_ADDR # clear data and policies for restore. run_sql "DROP DATABASE $DB;" diff --git a/br/tests/config/pd.toml b/br/tests/config/pd.toml index a00c84f3ddb9a..3adbcc89b320a 100644 --- a/br/tests/config/pd.toml +++ b/br/tests/config/pd.toml @@ -5,6 +5,3 @@ enable-placement-rules = true cacert-path = "/tmp/backup_restore_test/certs/ca.pem" cert-path = "/tmp/backup_restore_test/certs/pd.pem" key-path = "/tmp/backup_restore_test/certs/pd.key" - -[log] -level = "debug" \ No newline at end of file