diff --git a/br/pkg/lightning/backend/local/region_job.go b/br/pkg/lightning/backend/local/region_job.go index 1707d05f59843..b328fc351d25c 100644 --- a/br/pkg/lightning/backend/local/region_job.go +++ b/br/pkg/lightning/backend/local/region_job.go @@ -219,6 +219,10 @@ func (local *Backend) doWrite(ctx context.Context, j *regionJob) error { failpoint.Return(err) }) + var cancel context.CancelFunc + ctx, cancel = context.WithTimeoutCause(ctx, 15*time.Minute, common.ErrWriteTooSlow) + defer cancel() + apiVersion := local.tikvCodec.GetAPIVersion() clientFactory := local.importClientFactory kvBatchSize := local.KVWriteBatchSize diff --git a/br/pkg/lightning/common/retry.go b/br/pkg/lightning/common/retry.go index 789c4f2d20a32..c44e2863ea874 100644 --- a/br/pkg/lightning/common/retry.go +++ b/br/pkg/lightning/common/retry.go @@ -90,13 +90,19 @@ var retryableErrorIDs = map[errors.ErrorID]struct{}{ drivererr.ErrUnknown.ID(): {}, } +// ErrWriteTooSlow is used to get rid of the gRPC blocking issue. 
+// There are some strange gRPC blocking issues whose root cause is still unknown, see +// https://github.com/pingcap/tidb/issues/48352 +// https://github.com/pingcap/tidb/issues/46321 +var ErrWriteTooSlow = errors.New("write too slow, maybe gRPC is blocked forever") + func isSingleRetryableError(err error) bool { err = errors.Cause(err) switch err { case nil, context.Canceled, context.DeadlineExceeded, io.EOF, sql.ErrNoRows: return false - case mysql.ErrInvalidConn, driver.ErrBadConn: + case mysql.ErrInvalidConn, driver.ErrBadConn, ErrWriteTooSlow: return true } diff --git a/br/pkg/lightning/common/retry_test.go b/br/pkg/lightning/common/retry_test.go index 114e500b3334c..af390e29e096b 100644 --- a/br/pkg/lightning/common/retry_test.go +++ b/br/pkg/lightning/common/retry_test.go @@ -35,6 +35,7 @@ import ( func TestIsRetryableError(t *testing.T) { require.False(t, IsRetryableError(context.Canceled)) require.False(t, IsRetryableError(context.DeadlineExceeded)) + require.True(t, IsRetryableError(ErrWriteTooSlow)) require.False(t, IsRetryableError(io.EOF)) require.False(t, IsRetryableError(&net.AddrError{})) require.False(t, IsRetryableError(&net.DNSError{}))