diff --git a/.chloggen/signalfx-exp-retry-dim-update-without-tags.yaml b/.chloggen/signalfx-exp-retry-dim-update-without-tags.yaml new file mode 100644 index 000000000000..1fed01836004 --- /dev/null +++ b/.chloggen/signalfx-exp-retry-dim-update-without-tags.yaml @@ -0,0 +1,27 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: exporter/signalfx + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Enabling retrying for dimension properties update without tags in case of 400 response error. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [36044] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: | + Property and tag updates are done using the same API call. After this change, the exporter will retry once to sync + properties in case of 400 response error. + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/exporter/signalfxexporter/internal/dimensions/dimclient.go b/exporter/signalfxexporter/internal/dimensions/dimclient.go index ccfdf0c03fac..ff0337767d3b 100644 --- a/exporter/signalfxexporter/internal/dimensions/dimclient.go +++ b/exporter/signalfxexporter/internal/dimensions/dimclient.go @@ -237,30 +237,32 @@ func (dc *DimensionClient) handleDimensionUpdate(ctx context.Context, dimUpdate req = req.WithContext( context.WithValue(req.Context(), RequestFailedCallbackKey, RequestFailedCallback(func(statusCode int, err error) { - if statusCode >= 400 && statusCode < 500 && statusCode != 404 { - dc.logger.Error( - "Unable to update dimension, not retrying", - zap.Error(err), - zap.String("URL", sanitize.URL(req.URL)), - zap.String("dimensionUpdate", dimUpdate.String()), - zap.Int("statusCode", statusCode), - ) - - // Don't retry if it is a 4xx error (except 404) since these - // imply an input/auth error, which is not going to be remedied - // by retrying. - // 404 errors are special because they can occur due to races - // within the dimension patch endpoint. - return + retry := false + retryMsg := "not retrying" + if statusCode == 400 && len(dimUpdate.Tags) > 0 { + // It's possible that number of tags is too large. In this case, + // we should retry the request without tags to update the dimension properties at least. + dimUpdate.Tags = nil + retry = true + retryMsg = "retrying without tags" + } else if statusCode == 404 || statusCode >= 500 { + // Retry on 5xx server errors or 404s which can occur due to races within the dimension patch endpoint. + retry = true + retryMsg = "retrying" } dc.logger.Error( - "Unable to update dimension, retrying", + "Unable to update dimension, "+retryMsg, zap.Error(err), zap.String("URL", sanitize.URL(req.URL)), zap.String("dimensionUpdate", dimUpdate.String()), zap.Int("statusCode", statusCode), ) + + if !retry { + return + } + // The retry is meant to provide some measure of robustness against // temporary API failures. If the API is down for significant // periods of time, dimension updates will probably eventually back diff --git a/exporter/signalfxexporter/internal/dimensions/dimclient_test.go b/exporter/signalfxexporter/internal/dimensions/dimclient_test.go index fb84254029f6..e82fbb9dfcab 100644 --- a/exporter/signalfxexporter/internal/dimensions/dimclient_test.go +++ b/exporter/signalfxexporter/internal/dimensions/dimclient_test.go @@ -273,6 +273,63 @@ func TestDimensionClient(t *testing.T) { require.EqualValues(t, 2, server.requestCount.Load()) }) + t.Run("successful retry without tags on 400 response", func(t *testing.T) { + server.reset() + server.respCode = http.StatusBadRequest + + require.NoError(t, client.acceptDimension(&DimensionUpdate{ + Name: "AWSUniqueID", + Value: "abcd", + Properties: map[string]*string{ + "c": newString("d"), + }, + Tags: map[string]bool{ + "running": true, + }, + })) + server.handleRequest() + require.Empty(t, server.acceptedDims) + + // The next successful request should be sent without tags. + server.respCode = http.StatusOK + server.handleRequest() + require.Equal(t, []dim{ + { + Key: "AWSUniqueID", + Value: "abcd", + Properties: map[string]*string{ + "c": newString("d"), + }, + }, + }, server.acceptedDims) + require.EqualValues(t, 2, server.requestCount.Load()) + }) + + t.Run("retry without tags only once", func(t *testing.T) { + server.reset() + server.respCode = http.StatusBadRequest + + require.NoError(t, client.acceptDimension(&DimensionUpdate{ + Name: "AWSUniqueID", + Value: "abcd", + Properties: map[string]*string{ + "c": newString("d"), + }, + Tags: map[string]bool{ + "running": true, + }, + })) + server.handleRequest() + require.Empty(t, server.acceptedDims) + + // handle retry + server.handleRequest() + + // ensure no more retries + time.Sleep(100 * time.Millisecond) + require.EqualValues(t, 2, server.requestCount.Load()) + }) + t.Run("send successive quick updates to same dim", func(t *testing.T) { server.reset()