Commit

ruler: don't retry on non-retriable error (grafana#7216)
* ruler: don't retry on non-retriable error

Fixes #6609

Signed-off-by: Vladimir Varankin <vladimir.varankin@grafana.com>

* Update CHANGELOG.md

Co-authored-by: Dimitar Dimitrov <dimitar.dimitrov@grafana.com>

---------

Signed-off-by: Vladimir Varankin <vladimir.varankin@grafana.com>
Co-authored-by: Dimitar Dimitrov <dimitar.dimitrov@grafana.com>
2 people authored and beatkind committed Feb 13, 2024
1 parent 1d2d2a7 commit c9c074b
Showing 3 changed files with 49 additions and 87 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -68,6 +68,7 @@
* [BUGFIX] Fix panic during tsdb Commit #6766
* [BUGFIX] tsdb/head: wlog exemplars after samples #6766
* [BUGFIX] Ruler: fix issue where "failed to remotely evaluate query expression, will retry" messages are logged without context such as the trace ID and do not appear in trace events. #6789
* [BUGFIX] Ruler: do not retry requests to remote querier when server's response exceeds its configured max payload size. #7216
* [BUGFIX] Querier: fix issue where spans in query request traces were not nested correctly. #6893
* [BUGFIX] Fix issue where all incoming HTTP requests have duplicate trace spans. #6920
* [BUGFIX] Querier: do not retry requests to store-gateway when a query gets canceled. #6934
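
The condition described by the new CHANGELOG entry arises when a grpc-go server's response exceeds its configured max send size. Below is a minimal sketch (not part of the commit) of the error shape the ruler then sees; the byte counts are illustrative, and the message text matches the string the new check looks for.

package main

import (
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func main() {
	// Sketch of the error grpc-go returns when a response is larger than the
	// server's configured max send size; the byte counts are illustrative.
	err := status.Error(codes.ResourceExhausted,
		"trying to send message larger than max (123456789 vs. 104857600)")
	fmt.Println(err)
	// rpc error: code = ResourceExhausted desc = trying to send message larger than max (123456789 vs. 104857600)
}
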
23 changes: 19 additions & 4 deletions pkg/ruler/remotequerier.go
@@ -30,6 +30,7 @@ import (
"github.com/prometheus/prometheus/promql"
"golang.org/x/exp/slices"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"

"github.com/grafana/mimir/pkg/querier/api"
"github.com/grafana/mimir/pkg/util/spanlogger"
@@ -308,11 +309,25 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
}
return resp, nil
}
// 4xx errors shouldn't be retried because it is expected that
// running the same query gives rise to the same 4xx error.
if code := grpcutil.ErrorToStatusCode(err); code/100 == 4 {
return nil, err

// Bail out if the error is known to be not retriable.
switch code := grpcutil.ErrorToStatusCode(err); code {
case codes.ResourceExhausted:
// In case the server is configured with "grpc-max-send-msg-size-bytes",
// and the response exceeds this limit, there is no point retrying the request.
// This is a special case, refer to grafana/mimir#7216.
if strings.Contains(err.Error(), "message larger than max") {
return nil, err
}
default:
// In case the error was a wrapped HTTPResponse, its code represents HTTP status;
// 4xx errors shouldn't be retried because it is expected that
// running the same query gives rise to the same 4xx error.
if code/100 == 4 {
return nil, err
}
}

if !retry.Ongoing() {
return nil, err
}
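
Below is a standalone sketch (not part of this commit) of the classification the new switch performs. The helper name is hypothetical, and grpcutil is assumed to be github.com/grafana/dskit/grpcutil as used elsewhere in Mimir; in the commit itself the logic is inlined in sendRequest, and a retriable error still has to pass the retry.Ongoing() check before another attempt is made.

package ruler

import (
	"strings"

	"github.com/grafana/dskit/grpcutil"
	"google.golang.org/grpc/codes"
)

// isRetriable (hypothetical name) reports whether an error from the remote
// querier is worth retrying, mirroring the switch added in sendRequest.
func isRetriable(err error) bool {
	switch code := grpcutil.ErrorToStatusCode(err); code {
	case codes.ResourceExhausted:
		// A response exceeding the server's gRPC max send size fails the same
		// way on every attempt, so retrying it is pointless.
		return !strings.Contains(err.Error(), "message larger than max")
	default:
		// For errors that wrap an HTTPResponse, the code carries the HTTP
		// status; 4xx errors are deterministic and therefore not retried.
		return code/100 != 4
	}
}
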
112 changes: 29 additions & 83 deletions pkg/ruler/remotequerier_test.go
@@ -185,26 +185,22 @@ func TestRemoteQuerier_Query(t *testing.T) {
})
}

func TestRemoteQuerier_SendRequest(t *testing.T) {
func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
const errMsg = "this is an error"
var (
successfulResponse = &httpgrpc.HTTPResponse{
Code: http.StatusOK,
Headers: []*httpgrpc.Header{
{Key: "Content-Type", Values: []string{"application/json"}},
},
Body: []byte(`{
"status": "success","data": {"resultType":"vector","result":[]}
}`),
}
erroneousResponse = &httpgrpc.HTTPResponse{
Code: http.StatusBadRequest,
Headers: []*httpgrpc.Header{
{Key: "Content-Type", Values: []string{"application/json"}},
},
Body: []byte("this is an error"),
}
)
successfulResponse := &httpgrpc.HTTPResponse{
Code: http.StatusOK,
Headers: []*httpgrpc.Header{
{Key: "Content-Type", Values: []string{"application/json"}},
},
Body: []byte(`{"status": "success","data": {"resultType":"vector","result":[]}}`),
}
erroneousResponse := &httpgrpc.HTTPResponse{
Code: http.StatusBadRequest,
Headers: []*httpgrpc.Header{
{Key: "Content-Type", Values: []string{"application/json"}},
},
Body: []byte("this is an error"),
}

tests := map[string]struct {
response *httpgrpc.HTTPResponse
@@ -237,6 +233,16 @@ func TestRemoteQuerier_SendRequest(t *testing.T) {
expectedError: status.Error(codes.DeadlineExceeded, context.DeadlineExceeded.Error()),
expectedRetries: true,
},
"gRPC ResourceExhausted error is retried": {
err: status.Error(codes.ResourceExhausted, errMsg),
expectedError: status.Error(codes.ResourceExhausted, errMsg),
expectedRetries: true,
},
"errors about execeeding gRPC server's limit are not retried": {
err: status.Error(codes.ResourceExhausted, "trying to send message larger than max"),
expectedError: status.Error(codes.ResourceExhausted, "trying to send message larger than max"),
expectedRetries: false,
},
"errors with code 4xx are not retried": {
err: httpgrpc.Errorf(http.StatusBadRequest, errMsg),
expectedError: httpgrpc.Errorf(http.StatusBadRequest, errMsg),
@@ -257,10 +263,7 @@ func TestRemoteQuerier_SendRequest(t *testing.T) {
}
for testName, testCase := range tests {
t.Run(testName, func(t *testing.T) {
var (
inReq *httpgrpc.HTTPRequest
count atomic.Int64
)
var count atomic.Int64

ctx, cancel := context.WithCancel(context.Background())
mockClientFn := func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
Expand All @@ -275,7 +278,7 @@ func TestRemoteQuerier_SendRequest(t *testing.T) {
}
q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
require.Equal(t, int64(0), count.Load())
_, err := q.sendRequest(ctx, inReq, log.NewNopLogger())
_, err := q.Query(ctx, "qs", time.Now())
if testCase.err == nil {
if testCase.expectedError == nil {
require.NoError(t, err)
@@ -717,63 +720,6 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) {
require.Error(t, err)
}

func TestRemoteQuerier_BackoffRetry(t *testing.T) {
tcs := map[string]struct {
failedRequests int
expectedError string
requestDeadline time.Duration
}{
"succeed on failed requests <= max retries": {
failedRequests: maxRequestRetries,
},
"fail on failed requests > max retries": {
failedRequests: maxRequestRetries + 1,
expectedError: "failed request: 4",
},
"return last known error on context cancellation": {
failedRequests: 1,
requestDeadline: 50 * time.Millisecond, // force context cancellation while waiting for retry
expectedError: "context deadline exceeded while retrying request, last err was: failed request: 1",
},
}
for tn, tc := range tcs {
t.Run(tn, func(t *testing.T) {
retries := 0
mockClientFn := func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
retries++
if retries <= tc.failedRequests {
return nil, fmt.Errorf("failed request: %d", retries)
}
return &httpgrpc.HTTPResponse{
Code: http.StatusOK,
Headers: []*httpgrpc.Header{
{Key: "Content-Type", Values: []string{"application/json"}},
},
Body: []byte(`{
"status": "success","data": {"resultType":"vector","result":[{"metric":{"foo":"bar"},"value":[1,"773054.5916666666"]}]}
}`),
}, nil
}
q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())

ctx := context.Background()
if tc.requestDeadline > 0 {
var cancelFn context.CancelFunc
ctx, cancelFn = context.WithTimeout(ctx, tc.requestDeadline)
defer cancelFn()
}

resp, err := q.Query(ctx, "qs", time.Now())
if tc.expectedError != "" {
require.EqualError(t, err, tc.expectedError)
} else {
require.NotNil(t, resp)
require.Len(t, resp, 1)
}
})
}
}

func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
var (
errorResp = &httpgrpc.HTTPResponse{
@@ -785,8 +731,8 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
"status": "error","errorType": "execution"
}`),
}
error4xx = status.Error(http.StatusUnprocessableEntity, "this is a 4xx error")
error5xx = status.Error(http.StatusInternalServerError, "this is a 5xx error")
error4xx = httpgrpc.Errorf(http.StatusUnprocessableEntity, "this is a 4xx error")
error5xx = httpgrpc.Errorf(http.StatusInternalServerError, "this is a 5xx error")
)
testCases := map[string]struct {
resp *httpgrpc.HTTPResponse
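
The last hunk replaces status.Error with httpgrpc.Errorf for the 4xx/5xx fixtures: status.Error would interpret 422 or 500 as a (non-existent) gRPC code, whereas httpgrpc.Errorf yields the wrapped-HTTPResponse error the remote querier actually receives, so the test now exercises the same path as production. A small sketch of the difference, assuming httpgrpc here is github.com/grafana/dskit/httpgrpc:

package main

import (
	"fmt"
	"net/http"

	"github.com/grafana/dskit/httpgrpc"
	"google.golang.org/grpc/status"
)

func main() {
	// httpgrpc.Errorf wraps an HTTP response into an error whose gRPC status
	// code equals the HTTP status, which is what the code/100 == 4 check in
	// sendRequest relies on.
	err := httpgrpc.Errorf(http.StatusUnprocessableEntity, "this is a 4xx error")
	if st, ok := status.FromError(err); ok {
		fmt.Println(uint32(st.Code())) // 422
	}
}
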
