Skip to content

Commit

Permalink
lightning: add timeout for "write" RPC (#48355) (#48396)
Browse files Browse the repository at this point in the history
close #46321, close #48352
  • Loading branch information
ti-chi-bot authored Nov 22, 2023
1 parent 40f8cb8 commit b0d4953
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
18 changes: 16 additions & 2 deletions br/pkg/lightning/backend/local/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package local
import (
"bytes"
"context"
goerrors "errors"
"fmt"
"io"
"math"
Expand Down Expand Up @@ -907,17 +908,30 @@ type rangeStats struct {
// We don't need to clean up the pairs already written to TiKV if an error is encountered;
// TiKV takes responsibility for doing so.
func (local *local) WriteToTiKV(
ctx context.Context,
pCtx context.Context,
engine *Engine,
region *split.RegionInfo,
start, end []byte,
regionSplitSize int64,
regionSplitKeys int64,
) ([]*sst.SSTMeta, Range, rangeStats, error) {
) (s []*sst.SSTMeta, r Range, r2 rangeStats, errRet error) {
failpoint.Inject("WriteToTiKVNotEnoughDiskSpace", func(_ failpoint.Value) {
failpoint.Return(nil, Range{}, rangeStats{},
errors.Errorf("The available disk of TiKV (%s) only left %d, and capacity is %d", "", 0, 0))
})
ctx, cancel := context.WithTimeout(pCtx, 15*time.Minute)
defer cancel()
defer func() {
deadline, ok := ctx.Deadline()
if !ok {
// should not happen
return
}
if goerrors.Is(errRet, context.DeadlineExceeded) && time.Now().After(deadline) {
errRet = common.ErrWriteTooSlow
}
}()

if local.checkTiKVAvaliable {
for _, peer := range region.Region.GetPeers() {
var e error
Expand Down
8 changes: 7 additions & 1 deletion br/pkg/lightning/common/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,19 @@ var retryableErrorIDs = map[errors.ErrorID]struct{}{
drivererr.ErrUnknown.ID(): {},
}

// ErrWriteTooSlow is a sentinel error used to work around writes that appear
// to block forever inside gRPC. isSingleRetryableError treats it as retryable.
// The root cause of the blocking is still unknown; see
// https://github.com/pingcap/tidb/issues/48352 and
// https://github.com/pingcap/tidb/issues/46321.
var ErrWriteTooSlow = errors.New("write too slow, maybe gRPC is blocked forever")

func isSingleRetryableError(err error) bool {
err = errors.Cause(err)

switch err {
case nil, context.Canceled, context.DeadlineExceeded, io.EOF, sql.ErrNoRows:
return false
case mysql.ErrInvalidConn, driver.ErrBadConn:
case mysql.ErrInvalidConn, driver.ErrBadConn, ErrWriteTooSlow:
return true
}

Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/common/retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
func TestIsRetryableError(t *testing.T) {
require.False(t, IsRetryableError(context.Canceled))
require.False(t, IsRetryableError(context.DeadlineExceeded))
require.True(t, IsRetryableError(ErrWriteTooSlow))
require.False(t, IsRetryableError(io.EOF))
require.False(t, IsRetryableError(&net.AddrError{}))
require.False(t, IsRetryableError(&net.DNSError{}))
Expand Down

0 comments on commit b0d4953

Please sign in to comment.