From 7921f0a5c58cfc1a7ee3f98ec8969381b9a7931b Mon Sep 17 00:00:00 2001
From: lance6716
Date: Tue, 7 Nov 2023 16:06:11 +0800
Subject: [PATCH 1/2] lightning: add timeout for "write" RPC

Signed-off-by: lance6716
---
 br/pkg/lightning/backend/local/region_job.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/br/pkg/lightning/backend/local/region_job.go b/br/pkg/lightning/backend/local/region_job.go
index 1707d05f59843..8af6402a77264 100644
--- a/br/pkg/lightning/backend/local/region_job.go
+++ b/br/pkg/lightning/backend/local/region_job.go
@@ -219,6 +219,14 @@ func (local *Backend) doWrite(ctx context.Context, j *regionJob) error {
 		failpoint.Return(err)
 	})
 
+	var cancel context.CancelFunc
+	// there are some strange blocking issues of gRPC like
+	// https://github.com/pingcap/tidb/issues/48352
+	// https://github.com/pingcap/tidb/issues/46321 and I don't know why 😭, so we
+	// set a timeout here to avoid blocking forever.
+	ctx, cancel = context.WithTimeout(ctx, 15*time.Minute)
+	defer cancel()
+
 	apiVersion := local.tikvCodec.GetAPIVersion()
 	clientFactory := local.importClientFactory
 	kvBatchSize := local.KVWriteBatchSize

From fd9fefdb4ac87e2cfa666b29751f614dc1f896ef Mon Sep 17 00:00:00 2001
From: lance6716
Date: Tue, 7 Nov 2023 16:46:15 +0800
Subject: [PATCH 2/2] address comment

Signed-off-by: lance6716
---
 br/pkg/lightning/backend/local/region_job.go | 6 +-----
 br/pkg/lightning/common/retry.go             | 8 +++++++-
 br/pkg/lightning/common/retry_test.go        | 1 +
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/br/pkg/lightning/backend/local/region_job.go b/br/pkg/lightning/backend/local/region_job.go
index 8af6402a77264..b328fc351d25c 100644
--- a/br/pkg/lightning/backend/local/region_job.go
+++ b/br/pkg/lightning/backend/local/region_job.go
@@ -220,11 +220,7 @@ func (local *Backend) doWrite(ctx context.Context, j *regionJob) error {
 	})
 
 	var cancel context.CancelFunc
-	// there are some strange blocking issues of gRPC like
-	// https://github.com/pingcap/tidb/issues/48352
-	// https://github.com/pingcap/tidb/issues/46321 and I don't know why 😭, so we
-	// set a timeout here to avoid blocking forever.
-	ctx, cancel = context.WithTimeout(ctx, 15*time.Minute)
+	ctx, cancel = context.WithTimeoutCause(ctx, 15*time.Minute, common.ErrWriteTooSlow)
 	defer cancel()
 
 	apiVersion := local.tikvCodec.GetAPIVersion()

diff --git a/br/pkg/lightning/common/retry.go b/br/pkg/lightning/common/retry.go
index 789c4f2d20a32..c44e2863ea874 100644
--- a/br/pkg/lightning/common/retry.go
+++ b/br/pkg/lightning/common/retry.go
@@ -90,13 +90,19 @@ var retryableErrorIDs = map[errors.ErrorID]struct{}{
 	drivererr.ErrUnknown.ID():         {},
 }
 
+// ErrWriteTooSlow is used to get rid of the gRPC blocking issue.
+// there are some strange blocking issues of gRPC like
+// https://github.com/pingcap/tidb/issues/48352
+// https://github.com/pingcap/tidb/issues/46321 and I don't know why 😭
+var ErrWriteTooSlow = errors.New("write too slow, maybe gRPC is blocked forever")
+
 func isSingleRetryableError(err error) bool {
 	err = errors.Cause(err)
 
 	switch err {
 	case nil, context.Canceled, context.DeadlineExceeded, io.EOF, sql.ErrNoRows:
 		return false
-	case mysql.ErrInvalidConn, driver.ErrBadConn:
+	case mysql.ErrInvalidConn, driver.ErrBadConn, ErrWriteTooSlow:
 		return true
 	}
 
diff --git a/br/pkg/lightning/common/retry_test.go b/br/pkg/lightning/common/retry_test.go
index 114e500b3334c..af390e29e096b 100644
--- a/br/pkg/lightning/common/retry_test.go
+++ b/br/pkg/lightning/common/retry_test.go
@@ -35,6 +35,7 @@ import (
 func TestIsRetryableError(t *testing.T) {
 	require.False(t, IsRetryableError(context.Canceled))
 	require.False(t, IsRetryableError(context.DeadlineExceeded))
+	require.True(t, IsRetryableError(ErrWriteTooSlow))
 	require.False(t, IsRetryableError(io.EOF))
 	require.False(t, IsRetryableError(&net.AddrError{}))
 	require.False(t, IsRetryableError(&net.DNSError{}))
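Illustration (not part of the patch): a minimal, runnable sketch of the standard-library behaviour the second commit appears to rely on. With context.WithTimeoutCause (Go 1.21+), ctx.Err() still reports context.DeadlineExceeded after the 15-minute timeout fires, while context.Cause(ctx) yields the caller-supplied error, so a retry classifier can treat the stalled "write" RPC as retryable. The names errWriteTooSlow and isRetryable below are hypothetical stand-ins for common.ErrWriteTooSlow and the check in common/retry.go; the real wiring lives in region_job.go.

// demo.go: sketch of context.WithTimeoutCause semantics (assumes Go 1.21+).
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errWriteTooSlow is a hypothetical stand-in for common.ErrWriteTooSlow.
var errWriteTooSlow = errors.New("write too slow, maybe gRPC is blocked forever")

// isRetryable is a simplified stand-in for lightning's retry check: the
// timeout cause is retryable, while plain cancellation or deadline errors
// are not.
func isRetryable(err error) bool {
	switch {
	case errors.Is(err, errWriteTooSlow):
		return true
	case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded):
		return false
	}
	return false
}

func main() {
	// A short timeout stands in for the patch's 15*time.Minute.
	ctx, cancel := context.WithTimeoutCause(context.Background(), 10*time.Millisecond, errWriteTooSlow)
	defer cancel()

	<-ctx.Done() // pretend the "write" RPC blocked until the deadline fired

	fmt.Println(ctx.Err())                       // context deadline exceeded
	fmt.Println(context.Cause(ctx))              // write too slow, maybe gRPC is blocked forever
	fmt.Println(isRetryable(context.Cause(ctx))) // true
}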