diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0214d05dcb0..999375c0009 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -68,6 +68,7 @@
 * [BUGFIX] Fix panic during tsdb Commit #6766
 * [BUGFIX] tsdb/head: wlog exemplars after samples #6766
 * [BUGFIX] Ruler: fix issue where "failed to remotely evaluate query expression, will retry" messages are logged without context such as the trace ID and do not appear in trace events. #6789
+* [BUGFIX] Ruler: do not retry requests to remote querier when server's response exceeds its configured max payload size. #7216
 * [BUGFIX] Querier: fix issue where spans in query request traces were not nested correctly. #6893
 * [BUGFIX] Fix issue where all incoming HTTP requests have duplicate trace spans. #6920
 * [BUGFIX] Querier: do not retry requests to store-gateway when a query gets canceled. #6934
diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index 25e7c858d9d..a1d7fb08260 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -30,6 +30,7 @@ import (
 	"github.com/prometheus/prometheus/promql"
 	"golang.org/x/exp/slices"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
 
 	"github.com/grafana/mimir/pkg/querier/api"
 	"github.com/grafana/mimir/pkg/util/spanlogger"
@@ -308,11 +309,25 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 			}
 			return resp, nil
 		}
-		// 4xx errors shouldn't be retried because it is expected that
-		// running the same query gives rise to the same 4xx error.
-		if code := grpcutil.ErrorToStatusCode(err); code/100 == 4 {
-			return nil, err
+
+		// Bail out if the error is known to be not retriable.
+		switch code := grpcutil.ErrorToStatusCode(err); code {
+		case codes.ResourceExhausted:
+			// In case the server is configured with "grpc-max-send-msg-size-bytes",
+			// and the response exceeds this limit, there is no point retrying the request.
+			// This is a special case, refer to grafana/mimir#7216.
+			if strings.Contains(err.Error(), "message larger than max") {
+				return nil, err
+			}
+		default:
+			// In case the error was a wrapped HTTPResponse, its code represents HTTP status;
+			// 4xx errors shouldn't be retried because it is expected that
+			// running the same query gives rise to the same 4xx error.
+			if code/100 == 4 {
+				return nil, err
+			}
 		}
+
 		if !retry.Ongoing() {
 			return nil, err
 		}
diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go
index 77bc2c7be7f..37decb97d9a 100644
--- a/pkg/ruler/remotequerier_test.go
+++ b/pkg/ruler/remotequerier_test.go
@@ -185,26 +185,22 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	})
 }
 
-func TestRemoteQuerier_SendRequest(t *testing.T) {
+func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 	const errMsg = "this is an error"
-	var (
-		successfulResponse = &httpgrpc.HTTPResponse{
-			Code: http.StatusOK,
-			Headers: []*httpgrpc.Header{
-				{Key: "Content-Type", Values: []string{"application/json"}},
-			},
-			Body: []byte(`{
-				"status": "success","data": {"resultType":"vector","result":[]}
-			}`),
-		}
-		erroneousResponse = &httpgrpc.HTTPResponse{
-			Code: http.StatusBadRequest,
-			Headers: []*httpgrpc.Header{
-				{Key: "Content-Type", Values: []string{"application/json"}},
-			},
-			Body: []byte("this is an error"),
-		}
-	)
+	successfulResponse := &httpgrpc.HTTPResponse{
+		Code: http.StatusOK,
+		Headers: []*httpgrpc.Header{
+			{Key: "Content-Type", Values: []string{"application/json"}},
+		},
+		Body: []byte(`{"status": "success","data": {"resultType":"vector","result":[]}}`),
+	}
+	erroneousResponse := &httpgrpc.HTTPResponse{
+		Code: http.StatusBadRequest,
+		Headers: []*httpgrpc.Header{
+			{Key: "Content-Type", Values: []string{"application/json"}},
+		},
+		Body: []byte("this is an error"),
+	}
 
 	tests := map[string]struct {
 		response        *httpgrpc.HTTPResponse
@@ -237,6 +233,16 @@ func TestRemoteQuerier_SendRequest(t *testing.T) {
 			expectedError:   status.Error(codes.DeadlineExceeded, context.DeadlineExceeded.Error()),
 			expectedRetries: true,
 		},
+		"gRPC ResourceExhausted error is retried": {
+			err:             status.Error(codes.ResourceExhausted, errMsg),
+			expectedError:   status.Error(codes.ResourceExhausted, errMsg),
+			expectedRetries: true,
+		},
+		"errors about exceeding gRPC server's limit are not retried": {
+			err:             status.Error(codes.ResourceExhausted, "trying to send message larger than max"),
+			expectedError:   status.Error(codes.ResourceExhausted, "trying to send message larger than max"),
+			expectedRetries: false,
+		},
 		"errors with code 4xx are not retried": {
 			err:             httpgrpc.Errorf(http.StatusBadRequest, errMsg),
 			expectedError:   httpgrpc.Errorf(http.StatusBadRequest, errMsg),
@@ -257,10 +263,7 @@ func TestRemoteQuerier_SendRequest(t *testing.T) {
 	}
 	for testName, testCase := range tests {
 		t.Run(testName, func(t *testing.T) {
-			var (
-				inReq *httpgrpc.HTTPRequest
-				count atomic.Int64
-			)
+			var count atomic.Int64
 
 			ctx, cancel := context.WithCancel(context.Background())
 
 			mockClientFn := func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
@@ -275,7 +278,7 @@ func TestRemoteQuerier_SendRequest(t *testing.T) {
 			}
 			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
 			require.Equal(t, int64(0), count.Load())
-			_, err := q.sendRequest(ctx, inReq, log.NewNopLogger())
+			_, err := q.Query(ctx, "qs", time.Now())
 
 			if testCase.err == nil {
 				if testCase.expectedError == nil {
 					require.NoError(t, err)
@@ -717,63 +720,6 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) {
 	require.Error(t, err)
 }
 
-func TestRemoteQuerier_BackoffRetry(t *testing.T) {
-	tcs := map[string]struct {
-		failedRequests  int
-		expectedError   string
-		requestDeadline time.Duration
-	}{
-		"succeed on failed requests <= max retries": {
-			failedRequests: maxRequestRetries,
-		},
-		"fail on failed requests > max retries": {
-			failedRequests: maxRequestRetries + 1,
-			expectedError:  "failed request: 4",
-		},
-		"return last known error on context cancellation": {
-			failedRequests:  1,
-			requestDeadline: 50 * time.Millisecond, // force context cancellation while waiting for retry
-			expectedError:   "context deadline exceeded while retrying request, last err was: failed request: 1",
-		},
-	}
-	for tn, tc := range tcs {
-		t.Run(tn, func(t *testing.T) {
-			retries := 0
-			mockClientFn := func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
-				retries++
-				if retries <= tc.failedRequests {
-					return nil, fmt.Errorf("failed request: %d", retries)
-				}
-				return &httpgrpc.HTTPResponse{
-					Code: http.StatusOK,
-					Headers: []*httpgrpc.Header{
-						{Key: "Content-Type", Values: []string{"application/json"}},
-					},
-					Body: []byte(`{
-						"status": "success","data": {"resultType":"vector","result":[{"metric":{"foo":"bar"},"value":[1,"773054.5916666666"]}]}
-					}`),
-				}, nil
-			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
-
-			ctx := context.Background()
-			if tc.requestDeadline > 0 {
-				var cancelFn context.CancelFunc
-				ctx, cancelFn = context.WithTimeout(ctx, tc.requestDeadline)
-				defer cancelFn()
-			}
-
-			resp, err := q.Query(ctx, "qs", time.Now())
-			if tc.expectedError != "" {
-				require.EqualError(t, err, tc.expectedError)
-			} else {
-				require.NotNil(t, resp)
-				require.Len(t, resp, 1)
-			}
-		})
-	}
-}
-
 func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
 	var (
 		errorResp = &httpgrpc.HTTPResponse{
@@ -785,8 +731,8 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
 			"status": "error","errorType": "execution"
 		}`),
 		}
-		error4xx = status.Error(http.StatusUnprocessableEntity, "this is a 4xx error")
-		error5xx = status.Error(http.StatusInternalServerError, "this is a 5xx error")
+		error4xx = httpgrpc.Errorf(http.StatusUnprocessableEntity, "this is a 4xx error")
+		error5xx = httpgrpc.Errorf(http.StatusInternalServerError, "this is a 5xx error")
 	)
 	testCases := map[string]struct {
 		resp            *httpgrpc.HTTPResponse
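For reference, the retry decision added to pkg/ruler/remotequerier.go above can be read on its own as a small classification step. The sketch below is not the Mimir implementation: the helper name isRetriable and the use of status.Code in place of grpcutil.ErrorToStatusCode are assumptions made so the example compiles stand-alone.

// Sketch only: mirrors the retry decision added to RemoteQuerier.sendRequest in the diff above.
// isRetriable and the use of status.Code instead of grpcutil.ErrorToStatusCode are assumptions
// made to keep the example self-contained.
package main

import (
	"fmt"
	"net/http"
	"strings"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// isRetriable reports whether a failed remote evaluation is worth retrying.
func isRetriable(err error) bool {
	code := status.Code(err)

	// A ResourceExhausted error mentioning "message larger than max" means the response
	// exceeded the server's configured gRPC message size ("grpc-max-send-msg-size-bytes");
	// retrying would only hit the same limit again.
	if code == codes.ResourceExhausted && strings.Contains(err.Error(), "message larger than max") {
		return false
	}

	// When the error wraps an HTTP response, the code carries the HTTP status; 4xx errors
	// are not retried because the same query is expected to fail the same way every time.
	if code/100 == 4 {
		return false
	}

	return true
}

func main() {
	fmt.Println(isRetriable(status.Error(codes.ResourceExhausted, "trying to send message larger than max"))) // false: oversized response
	fmt.Println(isRetriable(status.Error(codes.ResourceExhausted, "some other exhaustion")))                  // true: other ResourceExhausted errors still retry
	fmt.Println(isRetriable(status.Error(codes.Code(http.StatusBadRequest), "bad request")))                  // false: 4xx
	fmt.Println(isRetriable(status.Error(codes.Unavailable, "connection refused")))                           // true: transient failure
}

In the actual change this logic sits inside sendRequest's retry loop: a non-retriable classification returns the error immediately, while anything else falls through to the backoff-driven retry.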