From 0adc6117828202a56a0cf45f717cd4283aed3cd2 Mon Sep 17 00:00:00 2001 From: ti-srebot <66930949+ti-srebot@users.noreply.github.com> Date: Mon, 30 May 2022 20:38:27 +0800 Subject: [PATCH] backup: retry on internal error and make more errors can be retried (#34352) (#34378) close pingcap/tidb#34350 --- br/pkg/backup/client.go | 27 +++++++++++++++++++++------ br/pkg/utils/retry.go | 1 + br/tests/br_full/run.sh | 4 ++-- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/br/pkg/backup/client.go b/br/pkg/backup/client.go index e4584df29f83a..ad26ba4b2d5c3 100644 --- a/br/pkg/backup/client.go +++ b/br/pkg/backup/client.go @@ -934,9 +934,17 @@ backupLoop: }) bcli, err := client.Backup(ctx, &req) failpoint.Inject("reset-retryable-error", func(val failpoint.Value) { - if val.(bool) { - logutil.CL(ctx).Debug("failpoint reset-retryable-error injected.") - err = status.Error(codes.Unavailable, "Unavailable error") + switch val.(string) { + case "Unavaiable": + { + logutil.CL(ctx).Debug("failpoint reset-retryable-error unavailable injected.") + err = status.Error(codes.Unavailable, "Unavailable error") + } + case "Internal": + { + logutil.CL(ctx).Debug("failpoint reset-retryable-error internal injected.") + err = status.Error(codes.Internal, "Internal error") + } } }) failpoint.Inject("reset-not-retryable-error", func(val failpoint.Value) { @@ -1002,9 +1010,15 @@ const ( // isRetryableError represents whether we should retry reset grpc connection. func isRetryableError(err error) bool { - - if status.Code(err) == codes.Unavailable { - return true + // some errors can be retried + // https://github.com/pingcap/tidb/issues/34350 + switch status.Code(err) { + case codes.Unavailable, codes.DeadlineExceeded, + codes.ResourceExhausted, codes.Aborted, codes.Internal: + { + log.Warn("backup met some errors, these errors can be retry 5 times", zap.Error(err)) + return true + } } // At least, there are two possible cancel() call, @@ -1012,6 +1026,7 @@ func isRetryableError(err error) bool { if status.Code(err) == codes.Canceled { if s, ok := status.FromError(err); ok { if strings.Contains(s.Message(), gRPC_Cancel) { + log.Warn("backup met grpc cancel error, this errors can be retry 5 times", zap.Error(err)) return true } } diff --git a/br/pkg/utils/retry.go b/br/pkg/utils/retry.go index 51a833d8d136c..bda305aaf11ac 100644 --- a/br/pkg/utils/retry.go +++ b/br/pkg/utils/retry.go @@ -31,6 +31,7 @@ var retryableServerError = []string{ "body write aborted", "error during dispatch", "put object timeout", + "internalerror", } // RetryableFunc presents a retryable operation. diff --git a/br/tests/br_full/run.sh b/br/tests/br_full/run.sh index 0d15794788b26..21f2e2143a002 100755 --- a/br/tests/br_full/run.sh +++ b/br/tests/br_full/run.sh @@ -30,7 +30,7 @@ done # backup full and kill tikv to test reset connection echo "backup with limit start..." -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/reset-retryable-error=1*return(true)" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/reset-retryable-error=1*return(\"Unavailable\")->1*return(\"Internal\")" run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB-limit" --concurrency 4 export GO_FAILPOINTS="" @@ -49,7 +49,7 @@ fi # backup full echo "backup with lz4 start..." -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/backup-storage-error=1*return(\"connection refused\")" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/backup-storage-error=1*return(\"connection refused\")->1*return(\"InternalError\")" run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB-lz4" --concurrency 4 --compression lz4 export GO_FAILPOINTS="" size_lz4=$(du -d 0 $TEST_DIR/$DB-lz4 | awk '{print $1}')