br: stop log when full restore failed (#51578) (#52375)
close #51572
ti-chi-bot authored Apr 17, 2024
1 parent a505558 commit 853d735
Showing 6 changed files with 41 additions and 26 deletions.
1 change: 1 addition & 0 deletions br/pkg/conn/BUILD.bazel
@@ -59,5 +59,6 @@ go_test(
"@org_golang_google_grpc//codes",
"@org_golang_google_grpc//status",
"@org_uber_go_goleak//:goleak",
"@org_uber_go_multierr//:multierr",
],
)
8 changes: 6 additions & 2 deletions br/pkg/conn/conn.go
@@ -82,16 +82,20 @@ func GetAllTiKVStoresWithRetry(ctx context.Context,
func() error {
stores, err = util.GetAllTiKVStores(ctx, pdClient, storeBehavior)
failpoint.Inject("hint-GetAllTiKVStores-error", func(val failpoint.Value) {
logutil.CL(ctx).Debug("failpoint hint-GetAllTiKVStores-error injected.")
if val.(bool) {
logutil.CL(ctx).Debug("failpoint hint-GetAllTiKVStores-error injected.")
err = status.Error(codes.Unknown, "Retryable error")
+} else {
+err = context.Canceled
}
})

failpoint.Inject("hint-GetAllTiKVStores-cancel", func(val failpoint.Value) {
logutil.CL(ctx).Debug("failpoint hint-GetAllTiKVStores-cancel injected.")
if val.(bool) {
logutil.CL(ctx).Debug("failpoint hint-GetAllTiKVStores-cancel injected.")
err = status.Error(codes.Canceled, "Cancel Retry")
+} else {
+err = context.Canceled
}
})

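Note: the Debug line now logs only when the failpoint actually fires with true, and a false value injects context.Canceled, a terminal error, so the tests below can stop the retry loop after a controlled number of attempts. A minimal standalone sketch of that flow, using a hypothetical failpoint name ("demo"); as with all pingcap/failpoint markers, failpoint.Inject is inert until the source is rewritten by failpoint-ctl (TiDB's make failpoint-enable):

package main

import (
	"context"
	"fmt"

	"github.com/pingcap/failpoint"
)

func fetchStores() (err error) {
	// After failpoint-ctl rewriting, this evaluates the "demo" failpoint at runtime.
	failpoint.Inject("demo", func(val failpoint.Value) {
		if val.(bool) {
			err = fmt.Errorf("retryable error") // first term: return(true)
		} else {
			err = context.Canceled // second term: return(false)
		}
	})
	return err
}

func main() {
	// "1*return(true)->1*return(false)": the failpoint yields true once, then
	// false once, then is exhausted and stops firing. (failpoint-ctl prefixes
	// the name with the package path; shortened here for illustration.)
	_ = failpoint.Enable("demo", "1*return(true)->1*return(false)")
	defer func() { _ = failpoint.Disable("demo") }()
	for i := 0; i < 3; i++ {
		fmt.Println(fetchStores()) // retryable error, context canceled, <nil>
	}
}
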
21 changes: 16 additions & 5 deletions br/pkg/conn/conn_test.go
@@ -18,12 +18,14 @@ import (
"github.com/pingcap/tidb/br/pkg/pdutil"
"github.com/pingcap/tidb/br/pkg/utils"
"github.com/stretchr/testify/require"
"go.uber.org/multierr"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

func TestGetAllTiKVStoresWithRetryCancel(t *testing.T) {
_ = failpoint.Enable("github.com/pingcap/tidb/br/pkg/conn/hint-GetAllTiKVStores-cancel", "return(true)")
_ = failpoint.Enable("github.com/pingcap/tidb/br/pkg/conn/hint-GetAllTiKVStores-cancel",
"1*return(true)->1*return(false)")
defer func() {
_ = failpoint.Disable("github.com/pingcap/tidb/br/pkg/conn/hint-GetAllTiKVStores-cancel")
}()
@@ -59,11 +61,14 @@ func TestGetAllTiKVStoresWithRetryCancel(t *testing.T) {

_, err := GetAllTiKVStoresWithRetry(ctx, fpdc, util.SkipTiFlash)
require.Error(t, err)
-require.Equal(t, codes.Canceled, status.Code(errors.Cause(err)))
+errs := multierr.Errors(err)
+require.Equal(t, 2, len(errs))
+require.Equal(t, codes.Canceled, status.Code(errors.Cause(errs[0])))
}

func TestGetAllTiKVStoresWithUnknown(t *testing.T) {
_ = failpoint.Enable("github.com/pingcap/tidb/br/pkg/conn/hint-GetAllTiKVStores-error", "return(true)")
_ = failpoint.Enable("github.com/pingcap/tidb/br/pkg/conn/hint-GetAllTiKVStores-error",
"1*return(true)->1*return(false)")
defer func() {
_ = failpoint.Disable("github.com/pingcap/tidb/br/pkg/conn/hint-GetAllTiKVStores-error")
}()
@@ -99,7 +104,9 @@ func TestGetAllTiKVStoresWithUnknown(t *testing.T) {

_, err := GetAllTiKVStoresWithRetry(ctx, fpdc, util.SkipTiFlash)
require.Error(t, err)
-require.Equal(t, codes.Unknown, status.Code(errors.Cause(err)))
+errs := multierr.Errors(err)
+require.Equal(t, 2, len(errs))
+require.Equal(t, codes.Unknown, status.Code(errors.Cause(errs[0])))
}
func TestCheckStoresAlive(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
@@ -397,14 +404,18 @@ func TestGetMergeRegionSizeAndCount(t *testing.T) {
},
}

-ctx := context.Background()
+pctx := context.Background()
for _, ca := range cases {
+ctx, cancel := context.WithCancel(pctx)
pdCli := utils.FakePDClient{Stores: ca.stores}
require.Equal(t, len(ca.content), len(ca.stores))
count := 0
mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch strings.TrimSpace(r.URL.Path) {
case "/config":
+if len(ca.content[count]) == 0 {
+cancel()
+}
_, _ = fmt.Fprint(w, ca.content[count])
default:
http.NotFoundHandler().ServeHTTP(w, r)
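
The new assertions unpack an aggregated error: with the chained failpoint expression, GetAllTiKVStoresWithRetry now fails twice (one retryable gRPC error, then one cancel), and the retry helper evidently combines the per-attempt errors with go.uber.org/multierr. A minimal sketch of the two multierr calls the tests rely on:

package main

import (
	"errors"
	"fmt"

	"go.uber.org/multierr"
)

func main() {
	var combined error
	combined = multierr.Append(combined, errors.New("attempt 1: retryable"))
	combined = multierr.Append(combined, errors.New("attempt 2: canceled"))

	errs := multierr.Errors(combined) // unpack into per-attempt errors, in order
	fmt.Println(len(errs))            // 2
	fmt.Println(errs[0])              // attempt 1: retryable
}
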
10 changes: 6 additions & 4 deletions br/pkg/restore/client.go
@@ -1454,12 +1454,14 @@ LOOPFORTABLE:
rc.workerPool.ApplyOnErrorGroup(eg, func() error {
filesGroups := getGroupFiles(filesReplica, rc.fileImporter.supportMultiIngest)
for _, filesGroup := range filesGroups {
-if importErr := func(fs []*backuppb.File) error {
+if importErr := func(fs []*backuppb.File) (err error) {
fileStart := time.Now()
defer func() {
log.Info("import files done", logutil.Files(filesGroup),
zap.Duration("take", time.Since(fileStart)))
updateCh.Inc()
if err == nil {
log.Info("import files done", logutil.Files(filesGroup),
zap.Duration("take", time.Since(fileStart)))
updateCh.Inc()
}
}()
return rc.fileImporter.ImportSSTFiles(ectx, fs, rewriteRules, rc.cipher, rc.dom.Store().GetCodec().GetAPIVersion())
}(filesGroup); importErr != nil {
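
This hunk is the commit title in miniature: the closure's return type changes from error to the named return (err error), so the deferred function can inspect the final error and only log "import files done" (and bump the progress channel) when the import actually succeeded. A minimal sketch of the named-return pattern:

package main

import (
	"errors"
	"fmt"
)

func importFiles(fail bool) (err error) {
	defer func() {
		// A deferred closure sees the named result after the body returns.
		if err == nil {
			fmt.Println("import files done") // success-only logging
		}
	}()
	if fail {
		return errors.New("import failed")
	}
	return nil
}

func main() {
	_ = importFiles(true)  // prints nothing
	_ = importFiles(false) // prints "import files done"
}
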
11 changes: 7 additions & 4 deletions br/pkg/utils/backoff.go
@@ -31,9 +31,9 @@ const (
backupSSTWaitInterval = 2 * time.Second
backupSSTMaxWaitInterval = 3 * time.Second

-resetTSRetryTime = 16
+resetTSRetryTime = 32
resetTSWaitInterval = 50 * time.Millisecond
-resetTSMaxWaitInterval = 500 * time.Millisecond
+resetTSMaxWaitInterval = 2 * time.Second

resetTSRetryTimeExt = 600
resetTSWaitIntervalExt = 500 * time.Millisecond
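
Rough effect of the new constants, assuming the delay doubles each attempt and is clamped at the max interval (the doubling is visible in NextBackoff below; the clamp is an assumption about code outside this diff): the worst-case resetTS retry window grows from roughly 6.75s (16 attempts capped at 500ms) to roughly 55s (32 attempts capped at 2s). A sketch that computes the bound:

package main

import (
	"fmt"
	"time"
)

// totalWait sums a doubling backoff clamped at max over the given attempts.
func totalWait(attempts int, start, max time.Duration) time.Duration {
	var total time.Duration
	d := start
	for i := 0; i < attempts; i++ {
		if d > max {
			d = max
		}
		total += d
		d *= 2
	}
	return total
}

func main() {
	fmt.Println(totalWait(16, 50*time.Millisecond, 500*time.Millisecond)) // 6.75s (old)
	fmt.Println(totalWait(32, 50*time.Millisecond, 2*time.Second))        // 55.15s (new)
}
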
@@ -167,7 +167,6 @@ func NewBackupSSTBackoffer() Backoffer {
}

func (bo *importerBackoffer) NextBackoff(err error) time.Duration {
log.Warn("retry to import ssts", zap.Int("attempt", bo.attempt), zap.Error(err))
// we don't care storeID here.
res := bo.errContext.HandleErrorMsg(err.Error(), 0)
if res.Strategy == RetryStrategy {
@@ -249,8 +248,12 @@ func (bo *pdReqBackoffer) NextBackoff(err error) time.Duration {
bo.delayTime = 2 * bo.delayTime
bo.attempt--
default:
+// If the connection times out, the pd client cancels the context and returns a gRPC context cancel error.
+// So make codes.Canceled retryable too.
+// It's OK to retry the gRPC context cancel error, because a parent context cancel surfaces as context.Canceled instead.
+// For example, canceling `ectx` makes pdClient.GetTS(ectx) return context.Canceled rather than the gRPC cancel error.
switch status.Code(e) {
-case codes.DeadlineExceeded, codes.NotFound, codes.AlreadyExists, codes.PermissionDenied, codes.ResourceExhausted, codes.Aborted, codes.OutOfRange, codes.Unavailable, codes.DataLoss, codes.Unknown:
+case codes.DeadlineExceeded, codes.Canceled, codes.NotFound, codes.AlreadyExists, codes.PermissionDenied, codes.ResourceExhausted, codes.Aborted, codes.OutOfRange, codes.Unavailable, codes.DataLoss, codes.Unknown:
bo.delayTime = 2 * bo.delayTime
bo.attempt--
default:
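
The new comment draws a line between two kinds of cancellation, sketched below: status.Code only recognizes errors carrying a gRPC status, so a pd-client-side timeout (surfaced as a Canceled status) now gets retried, while a genuine parent-context cancellation surfaces as the plain context.Canceled sentinel, which, per the updated test below, still ends the retry loop before this switch is consulted:

package main

import (
	"context"
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func main() {
	grpcCancel := status.Error(codes.Canceled, "context canceled") // e.g. pd client timing out a request
	localCancel := context.Canceled                                // e.g. the caller canceling ectx

	fmt.Println(status.Code(grpcCancel))  // Canceled: retryable after this change
	fmt.Println(status.Code(localCancel)) // Unknown: no gRPC status attached
}
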
16 changes: 5 additions & 11 deletions br/pkg/utils/backoff_test.go
@@ -123,26 +123,20 @@ func TestPdBackoffWithRetryableError(t *testing.T) {
if counter == 2 {
return io.EOF
}
+if counter == 6 {
+return context.Canceled
+}
return gRPCError
}, backoffer)
-require.Equal(t, 16, counter)
+require.Equal(t, 7, counter)
require.Equal(t, []error{
gRPCError,
gRPCError,
io.EOF,
gRPCError,
gRPCError,
gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
-gRPCError,
+context.Canceled,
}, multierr.Errors(err))
}

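Why counter stops at 7: the callback fails on every attempt, the helper records each attempt's error, and the context.Canceled returned on the seventh call is terminal. A simplified stand-in for utils.WithRetry (the real helper drives a Backoffer; the aggregation with multierr.Append is an assumption consistent with the multierr.Errors assertion above):

package main

import (
	"context"
	"errors"
	"fmt"
	"io"

	"go.uber.org/multierr"
)

func withRetry(fn func() error, attempts int, retryable func(error) bool) error {
	var all error
	for i := 0; i < attempts; i++ {
		err := fn()
		if err == nil {
			return nil
		}
		all = multierr.Append(all, err) // record every failed attempt
		if !retryable(err) {
			break // terminal error: stop early
		}
	}
	return all
}

func main() {
	counter := 0
	err := withRetry(func() error {
		defer func() { counter++ }()
		if counter == 2 {
			return io.EOF
		}
		if counter == 6 {
			return context.Canceled
		}
		return errors.New("gRPC error")
	}, 16, func(err error) bool { return !errors.Is(err, context.Canceled) })

	fmt.Println(counter, len(multierr.Errors(err))) // 7 7
}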
