From d01896aea22aca5143df125109f6f8ffa4a7adec Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Fri, 7 Oct 2022 13:21:47 -0400 Subject: [PATCH 1/9] add status code label to the numTotalFailedNotifications metric Signed-off-by: Yijie Qin --- notify/notify.go | 13 +++++++--- notify/notify_test.go | 55 +++++++++++++++++++++++++++++++++++++++++++ notify/sns/sns.go | 5 +++- notify/util.go | 29 +++++++++++++++++++++++ 4 files changed, 98 insertions(+), 4 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index d6c741fe47..6d16172345 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -51,6 +51,9 @@ type Peer interface { // to a notification pipeline. const MinTimeout = 10 * time.Second +// defaultStatusCode is the default status code for numTotalFailedNotifications metric +const defaultStatusCode = "5xx" + // Notifier notifies about alerts under constraints of the given context. It // returns an error if unsuccessful and a flag whether the error is // recoverable. This information is useful for a retry logic. @@ -262,7 +265,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics { Namespace: "alertmanager", Name: "notifications_failed_total", Help: "The total number of failed notifications.", - }, []string{"integration"}), + }, []string{"integration", "statusCode"}), numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "alertmanager", Name: "notification_requests_total", @@ -293,7 +296,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics { "telegram", } { m.numNotifications.WithLabelValues(integration) - m.numTotalFailedNotifications.WithLabelValues(integration) + m.numTotalFailedNotifications.WithLabelValues(integration, "") m.numNotificationRequestsTotal.WithLabelValues(integration) m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) @@ -663,7 +666,11 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc() ctx, alerts, err := r.exec(ctx, l, alerts...) if err != nil { - r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name()).Inc() + if e, ok := errors.Cause(err).(*ErrorWithStatusCode); ok { + r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), getFailureStatusCodeCategory(e.StatusCode)).Inc() + } else { + r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), defaultStatusCode).Inc() + } } return ctx, alerts, err } diff --git a/notify/notify_test.go b/notify/notify_test.go index 32bd29a1c3..7596f50936 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -24,6 +24,7 @@ import ( "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "gopkg.in/yaml.v2" @@ -422,6 +423,60 @@ func TestRetryStageWithError(t *testing.T) { require.NotNil(t, resctx) } +func TestRetryStageWithErrorCode(t *testing.T) { + testcases := map[string]struct { + errorcode int + codelabel string + expectedCount int + }{ + "for 400": {errorcode: 400, codelabel: "4xx", expectedCount: 1}, + "for 402": {errorcode: 402, codelabel: "4xx", expectedCount: 1}, + "for 500": {errorcode: 500, codelabel: "5xx", expectedCount: 1}, + "for 502": {errorcode: 502, codelabel: "5xx", expectedCount: 1}, + } + for _, testData := range testcases { + fail, retry := true, false + sent := []*types.Alert{} + testData := testData + i := Integration{ + name: "test", + notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { + if fail { + fail = false + return retry, NewErrorWithStatusCode(testData.errorcode, errors.New("fail to deliver notification")) + } + sent = append(sent, alerts...) + return false, nil + }), + rs: sendResolved(false), + } + r := RetryStage{ + integration: i, + metrics: NewMetrics(prometheus.NewRegistry()), + } + + alerts := []*types.Alert{ + { + Alert: model.Alert{ + EndsAt: time.Now().Add(time.Hour), + }, + }, + } + + ctx := context.Background() + ctx = WithFiringAlerts(ctx, []uint64{0}) + + // Notify with a non-recoverable error. + resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...) + counter := r.metrics.numTotalFailedNotifications + + require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.codelabel)))) + + require.NotNil(t, err) + require.NotNil(t, resctx) + } +} + func TestRetryStageNoResolved(t *testing.T) { sent := []*types.Alert{} i := Integration{ diff --git a/notify/sns/sns.go b/notify/sns/sns.go index 1dfd42b4a5..c4221eaa10 100644 --- a/notify/sns/sns.go +++ b/notify/sns/sns.go @@ -83,7 +83,10 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err publishOutput, err := client.Publish(publishInput) if err != nil { if e, ok := err.(awserr.RequestFailure); ok { - return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + retryable, error := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + + statusErr := notify.NewErrorWithStatusCode(e.StatusCode(), error) + return retryable, statusErr } return true, err } diff --git a/notify/util.go b/notify/util.go index 0c5b675448..f0c9d86176 100644 --- a/notify/util.go +++ b/notify/util.go @@ -167,6 +167,17 @@ func readAll(r io.Reader) string { return string(bs) } +func getFailureStatusCodeCategory(statusCode int) string { + if statusCode/100 == 4 { + return "4xx" + } + if statusCode/100 == 5 { + return "5xx" + } + + return "" +} + // Retrier knows when to retry an HTTP request to a receiver. 2xx status codes // are successful, anything else is a failure and only 5xx status codes should // be retried. @@ -209,3 +220,21 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) { } return retry, errors.New(s) } + +type ErrorWithStatusCode struct { + Err error + + // The status code of the HTTP response. + StatusCode int +} + +func NewErrorWithStatusCode(statusCode int, err error) *ErrorWithStatusCode { + return &ErrorWithStatusCode{ + Err: err, + StatusCode: statusCode, + } +} + +func (e *ErrorWithStatusCode) Error() string { + return e.Err.Error() +} From 1fdaa834461276d9805b063f6f1278a98de30fc2 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Tue, 11 Oct 2022 22:38:48 -0400 Subject: [PATCH 2/9] getFailureStatusCodeCategory to return error rather than empty string when unknown status code Signed-off-by: Yijie Qin --- notify/notify.go | 14 +++++++++----- notify/util.go | 11 +++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index 6d16172345..ae7542687e 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -51,8 +51,8 @@ type Peer interface { // to a notification pipeline. const MinTimeout = 10 * time.Second -// defaultStatusCode is the default status code for numTotalFailedNotifications metric -const defaultStatusCode = "5xx" +// defaultStatusCodeCategory is the default status code category for numTotalFailedNotifications metric +const defaultStatusCodeCategory = "5xx" // Notifier notifies about alerts under constraints of the given context. It // returns an error if unsuccessful and a flag whether the error is @@ -665,13 +665,17 @@ func NewRetryStage(i Integration, groupName string, metrics *Metrics) *RetryStag func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc() ctx, alerts, err := r.exec(ctx, l, alerts...) + + statusCodeCategory := defaultStatusCodeCategory if err != nil { if e, ok := errors.Cause(err).(*ErrorWithStatusCode); ok { - r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), getFailureStatusCodeCategory(e.StatusCode)).Inc() - } else { - r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), defaultStatusCode).Inc() + result, interErr := getFailureStatusCodeCategory(e.StatusCode) + if interErr == nil { + statusCodeCategory = result + } } } + r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), statusCodeCategory).Inc() return ctx, alerts, err } diff --git a/notify/util.go b/notify/util.go index f0c9d86176..b3a7a7cd5f 100644 --- a/notify/util.go +++ b/notify/util.go @@ -167,15 +167,18 @@ func readAll(r io.Reader) string { return string(bs) } -func getFailureStatusCodeCategory(statusCode int) string { +// getFailureStatusCodeCategory return the status code category for failure request +// the status starts with 4 will return 4xx and starts with 5 will return 5xx +// other than 4xx and 5xx input status will return an error. +func getFailureStatusCodeCategory(statusCode int) (string, error) { if statusCode/100 == 4 { - return "4xx" + return "4xx", nil } if statusCode/100 == 5 { - return "5xx" + return "5xx", nil } - return "" + return "", fmt.Errorf("unexpected status code %v", statusCode) } // Retrier knows when to retry an HTTP request to a receiver. 2xx status codes From 0e86d0fc2b206c812e607f445223b8a9b0e8f66b Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Wed, 12 Oct 2022 15:34:39 -0400 Subject: [PATCH 3/9] move the metric inc Signed-off-by: Yijie Qin --- notify/notify.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notify/notify.go b/notify/notify.go index ae7542687e..2fe4f12f96 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -674,8 +674,8 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale statusCodeCategory = result } } + r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), statusCodeCategory).Inc() } - r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), statusCodeCategory).Inc() return ctx, alerts, err } From 10e4d1c58dce448ff175a0180c373d9886baea05 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Mon, 31 Oct 2022 20:25:25 -0400 Subject: [PATCH 4/9] fix comments Signed-off-by: Yijie Qin --- notify/notify.go | 6 +++++- notify/notify_test.go | 9 +++++---- notify/util.go | 15 ++++++++++++--- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index 2fe4f12f96..c47372854b 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -265,7 +265,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics { Namespace: "alertmanager", Name: "notifications_failed_total", Help: "The total number of failed notifications.", - }, []string{"integration", "statusCode"}), + }, []string{"integration", "code"}), numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "alertmanager", Name: "notification_requests_total", @@ -300,6 +300,10 @@ func NewMetrics(r prometheus.Registerer) *Metrics { m.numNotificationRequestsTotal.WithLabelValues(integration) m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) + + for _, code := range PossibleFailureStatusCategory { + m.numTotalFailedNotifications.WithLabelValues(integration, code) + } } r.MustRegister( m.numNotifications, m.numTotalFailedNotifications, diff --git a/notify/notify_test.go b/notify/notify_test.go index 7596f50936..577179bced 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -429,10 +429,11 @@ func TestRetryStageWithErrorCode(t *testing.T) { codelabel string expectedCount int }{ - "for 400": {errorcode: 400, codelabel: "4xx", expectedCount: 1}, - "for 402": {errorcode: 402, codelabel: "4xx", expectedCount: 1}, - "for 500": {errorcode: 500, codelabel: "5xx", expectedCount: 1}, - "for 502": {errorcode: 502, codelabel: "5xx", expectedCount: 1}, + "for 400": {errorcode: 400, codelabel: failure4xxCategoryCode, expectedCount: 1}, + "for 402": {errorcode: 402, codelabel: failure4xxCategoryCode, expectedCount: 1}, + "for 500": {errorcode: 500, codelabel: failure5xxCategoryCode, expectedCount: 1}, + "for 502": {errorcode: 502, codelabel: failure5xxCategoryCode, expectedCount: 1}, + "for expected code 100": {errorcode: 100, codelabel: failure5xxCategoryCode, expectedCount: 1}, } for _, testData := range testcases { fail, retry := true, false diff --git a/notify/util.go b/notify/util.go index b3a7a7cd5f..c163cf4ea1 100644 --- a/notify/util.go +++ b/notify/util.go @@ -30,9 +30,18 @@ import ( "github.com/prometheus/alertmanager/types" ) +const ( + failure4xxCategoryCode = "4xx" + failure5xxCategoryCode = "5xx" + failureUnknownCategoryCode = "unknown" +) + // UserAgentHeader is the default User-Agent for notification requests var UserAgentHeader = fmt.Sprintf("Alertmanager/%s", version.Version) +// PossibleFailureStatusCategory is a list of possible failure status code category +var PossibleFailureStatusCategory = []string{failure4xxCategoryCode, failure5xxCategoryCode, failureUnknownCategoryCode} + // RedactURL removes the URL part from an error of *url.Error type. func RedactURL(err error) error { e, ok := err.(*url.Error) @@ -172,13 +181,13 @@ func readAll(r io.Reader) string { // other than 4xx and 5xx input status will return an error. func getFailureStatusCodeCategory(statusCode int) (string, error) { if statusCode/100 == 4 { - return "4xx", nil + return failure4xxCategoryCode, nil } if statusCode/100 == 5 { - return "5xx", nil + return failure5xxCategoryCode, nil } - return "", fmt.Errorf("unexpected status code %v", statusCode) + return failureUnknownCategoryCode, fmt.Errorf("unexpected status code %v", statusCode) } // Retrier knows when to retry an HTTP request to a receiver. 2xx status codes From 1ef398f12d19763e913083ec42d32382f09340b4 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Thu, 3 Nov 2022 10:19:13 -0700 Subject: [PATCH 5/9] remove unnessary line Signed-off-by: Yijie Qin --- notify/notify.go | 1 - 1 file changed, 1 deletion(-) diff --git a/notify/notify.go b/notify/notify.go index c47372854b..d69c5fd136 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -296,7 +296,6 @@ func NewMetrics(r prometheus.Registerer) *Metrics { "telegram", } { m.numNotifications.WithLabelValues(integration) - m.numTotalFailedNotifications.WithLabelValues(integration, "") m.numNotificationRequestsTotal.WithLabelValues(integration) m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) From 00c66763eb9e6af5f8065805c0f053746cf26ff4 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Wed, 16 Nov 2022 12:53:05 -0500 Subject: [PATCH 6/9] fix comments Signed-off-by: Yijie Qin --- notify/notify.go | 29 ++++++++++++++++++++++++----- notify/util.go | 23 ----------------------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index d69c5fd136..f10fb8aba7 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -54,6 +54,14 @@ const MinTimeout = 10 * time.Second // defaultStatusCodeCategory is the default status code category for numTotalFailedNotifications metric const defaultStatusCodeCategory = "5xx" +const ( + failure4xxCategoryCode = "4xx" + failure5xxCategoryCode = "5xx" +) + +// possibleFailureStatusCategory is a list of possible failure status code category +var possibleFailureStatusCategory = []string{failure4xxCategoryCode, failure5xxCategoryCode} + // Notifier notifies about alerts under constraints of the given context. It // returns an error if unsuccessful and a flag whether the error is // recoverable. This information is useful for a retry logic. @@ -300,7 +308,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics { m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) - for _, code := range PossibleFailureStatusCategory { + for _, code := range possibleFailureStatusCategory { m.numTotalFailedNotifications.WithLabelValues(integration, code) } } @@ -672,10 +680,7 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale statusCodeCategory := defaultStatusCodeCategory if err != nil { if e, ok := errors.Cause(err).(*ErrorWithStatusCode); ok { - result, interErr := getFailureStatusCodeCategory(e.StatusCode) - if interErr == nil { - statusCodeCategory = result - } + statusCodeCategory = getFailureStatusCodeCategory(e.StatusCode) } r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), statusCodeCategory).Inc() } @@ -890,3 +895,17 @@ func inTimeIntervals(now time.Time, intervals map[string][]timeinterval.TimeInte } return false, nil } + +// getFailureStatusCodeCategory return the status code category for failure request +// the status starts with 4 will return 4xx and starts with 5 will return 5xx +// other than 4xx and 5xx input status will return an 5xx. +func getFailureStatusCodeCategory(statusCode int) string { + if statusCode/100 == 4 { + return failure4xxCategoryCode + } + if statusCode/100 == 5 { + return failure5xxCategoryCode + } + + return defaultStatusCodeCategory +} diff --git a/notify/util.go b/notify/util.go index c163cf4ea1..3b2ccda504 100644 --- a/notify/util.go +++ b/notify/util.go @@ -30,18 +30,9 @@ import ( "github.com/prometheus/alertmanager/types" ) -const ( - failure4xxCategoryCode = "4xx" - failure5xxCategoryCode = "5xx" - failureUnknownCategoryCode = "unknown" -) - // UserAgentHeader is the default User-Agent for notification requests var UserAgentHeader = fmt.Sprintf("Alertmanager/%s", version.Version) -// PossibleFailureStatusCategory is a list of possible failure status code category -var PossibleFailureStatusCategory = []string{failure4xxCategoryCode, failure5xxCategoryCode, failureUnknownCategoryCode} - // RedactURL removes the URL part from an error of *url.Error type. func RedactURL(err error) error { e, ok := err.(*url.Error) @@ -176,20 +167,6 @@ func readAll(r io.Reader) string { return string(bs) } -// getFailureStatusCodeCategory return the status code category for failure request -// the status starts with 4 will return 4xx and starts with 5 will return 5xx -// other than 4xx and 5xx input status will return an error. -func getFailureStatusCodeCategory(statusCode int) (string, error) { - if statusCode/100 == 4 { - return failure4xxCategoryCode, nil - } - if statusCode/100 == 5 { - return failure5xxCategoryCode, nil - } - - return failureUnknownCategoryCode, fmt.Errorf("unexpected status code %v", statusCode) -} - // Retrier knows when to retry an HTTP request to a receiver. 2xx status codes // are successful, anything else is a failure and only 5xx status codes should // be retried. From b364f981bbf5b46a7a5def9d4be965de34cb7f71 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Mon, 9 Jan 2023 23:05:45 -0500 Subject: [PATCH 7/9] use status reason rather than code Signed-off-by: Yijie Qin --- notify/notify.go | 33 ++++----------------------------- notify/notify_test.go | 16 +++++++--------- notify/sns/sns.go | 4 ++-- notify/util.go | 41 +++++++++++++++++++++++++++++++++-------- 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index f10fb8aba7..d3f13fb962 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -51,17 +51,6 @@ type Peer interface { // to a notification pipeline. const MinTimeout = 10 * time.Second -// defaultStatusCodeCategory is the default status code category for numTotalFailedNotifications metric -const defaultStatusCodeCategory = "5xx" - -const ( - failure4xxCategoryCode = "4xx" - failure5xxCategoryCode = "5xx" -) - -// possibleFailureStatusCategory is a list of possible failure status code category -var possibleFailureStatusCategory = []string{failure4xxCategoryCode, failure5xxCategoryCode} - // Notifier notifies about alerts under constraints of the given context. It // returns an error if unsuccessful and a flag whether the error is // recoverable. This information is useful for a retry logic. @@ -677,12 +666,12 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc() ctx, alerts, err := r.exec(ctx, l, alerts...) - statusCodeCategory := defaultStatusCodeCategory + failureReason := defaultFailureReason if err != nil { - if e, ok := errors.Cause(err).(*ErrorWithStatusCode); ok { - statusCodeCategory = getFailureStatusCodeCategory(e.StatusCode) + if e, ok := errors.Cause(err).(*ErrorWithReason); ok { + failureReason = e.Reason } - r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), statusCodeCategory).Inc() + r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), failureReason).Inc() } return ctx, alerts, err } @@ -895,17 +884,3 @@ func inTimeIntervals(now time.Time, intervals map[string][]timeinterval.TimeInte } return false, nil } - -// getFailureStatusCodeCategory return the status code category for failure request -// the status starts with 4 will return 4xx and starts with 5 will return 5xx -// other than 4xx and 5xx input status will return an 5xx. -func getFailureStatusCodeCategory(statusCode int) string { - if statusCode/100 == 4 { - return failure4xxCategoryCode - } - if statusCode/100 == 5 { - return failure5xxCategoryCode - } - - return defaultStatusCodeCategory -} diff --git a/notify/notify_test.go b/notify/notify_test.go index 577179bced..6950eadd15 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -425,15 +425,13 @@ func TestRetryStageWithError(t *testing.T) { func TestRetryStageWithErrorCode(t *testing.T) { testcases := map[string]struct { - errorcode int - codelabel string + reason string + reasonlabel string expectedCount int }{ - "for 400": {errorcode: 400, codelabel: failure4xxCategoryCode, expectedCount: 1}, - "for 402": {errorcode: 402, codelabel: failure4xxCategoryCode, expectedCount: 1}, - "for 500": {errorcode: 500, codelabel: failure5xxCategoryCode, expectedCount: 1}, - "for 502": {errorcode: 502, codelabel: failure5xxCategoryCode, expectedCount: 1}, - "for expected code 100": {errorcode: 100, codelabel: failure5xxCategoryCode, expectedCount: 1}, + "for clientError": {reason: clientError, reasonlabel: clientError, expectedCount: 1}, + "for serverError": {reason: serverError, reasonlabel: serverError, expectedCount: 1}, + "for unexpected code": {reason: "unexpected", reasonlabel: defaultFailureReason, expectedCount: 1}, } for _, testData := range testcases { fail, retry := true, false @@ -444,7 +442,7 @@ func TestRetryStageWithErrorCode(t *testing.T) { notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { if fail { fail = false - return retry, NewErrorWithStatusCode(testData.errorcode, errors.New("fail to deliver notification")) + return retry, NewErrorWithReason(testData.reason, errors.New("fail to deliver notification")) } sent = append(sent, alerts...) return false, nil @@ -471,7 +469,7 @@ func TestRetryStageWithErrorCode(t *testing.T) { resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...) counter := r.metrics.numTotalFailedNotifications - require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.codelabel)))) + require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.reasonlabel)))) require.NotNil(t, err) require.NotNil(t, resctx) diff --git a/notify/sns/sns.go b/notify/sns/sns.go index c4221eaa10..b9881b0175 100644 --- a/notify/sns/sns.go +++ b/notify/sns/sns.go @@ -85,8 +85,8 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err if e, ok := err.(awserr.RequestFailure); ok { retryable, error := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) - statusErr := notify.NewErrorWithStatusCode(e.StatusCode(), error) - return retryable, statusErr + reasonErr := notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(e.StatusCode()), error) + return retryable, reasonErr } return true, err } diff --git a/notify/util.go b/notify/util.go index 3b2ccda504..98d8943641 100644 --- a/notify/util.go +++ b/notify/util.go @@ -30,6 +30,17 @@ import ( "github.com/prometheus/alertmanager/types" ) +// defaultFailureReason is the default reason for numTotalFailedNotifications metric +const defaultFailureReason = "other" + +const ( + clientError = "clientError" + serverError = "ServerError" +) + +// possibleFailureStatusCategory is a list of possible failure reason +var possibleFailureStatusCategory = []string{clientError, serverError, defaultFailureReason} + // UserAgentHeader is the default User-Agent for notification requests var UserAgentHeader = fmt.Sprintf("Alertmanager/%s", version.Version) @@ -210,20 +221,34 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) { return retry, errors.New(s) } -type ErrorWithStatusCode struct { +type ErrorWithReason struct { Err error - // The status code of the HTTP response. - StatusCode int + // The reason of the failure. + Reason string } -func NewErrorWithStatusCode(statusCode int, err error) *ErrorWithStatusCode { - return &ErrorWithStatusCode{ - Err: err, - StatusCode: statusCode, +func NewErrorWithReason(reason string, err error) *ErrorWithReason { + return &ErrorWithReason{ + Err: err, + Reason: reason, } } -func (e *ErrorWithStatusCode) Error() string { +func (e *ErrorWithReason) Error() string { return e.Err.Error() } + +// GetFailureReasonFromStatusCode return the reason for failure request +// the status starts with 4 will return 4xx and starts with 5 will return 5xx +// other than 4xx and 5xx input status will return an 5xx. +func GetFailureReasonFromStatusCode(statusCode int) string { + if statusCode/100 == 4 { + return clientError + } + if statusCode/100 == 5 { + return serverError + } + + return defaultFailureReason +} From b28f77133f03988ae71766efbe8cf29b5acf051d Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Tue, 10 Jan 2023 11:26:38 -0500 Subject: [PATCH 8/9] use iota for reason type Signed-off-by: Yijie Qin --- notify/notify.go | 10 ++++----- notify/notify_test.go | 24 ++++++++++------------ notify/util.go | 48 ++++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index d3f13fb962..fc04cbbe32 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -262,7 +262,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics { Namespace: "alertmanager", Name: "notifications_failed_total", Help: "The total number of failed notifications.", - }, []string{"integration", "code"}), + }, []string{"integration", "reason"}), numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "alertmanager", Name: "notification_requests_total", @@ -297,8 +297,8 @@ func NewMetrics(r prometheus.Registerer) *Metrics { m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) - for _, code := range possibleFailureStatusCategory { - m.numTotalFailedNotifications.WithLabelValues(integration, code) + for _, reason := range possibleFailureReasonCategory { + m.numTotalFailedNotifications.WithLabelValues(integration, reason) } } r.MustRegister( @@ -666,10 +666,10 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc() ctx, alerts, err := r.exec(ctx, l, alerts...) - failureReason := defaultFailureReason + failureReason := DefaultReason.String() if err != nil { if e, ok := errors.Cause(err).(*ErrorWithReason); ok { - failureReason = e.Reason + failureReason = e.Reason.String() } r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), failureReason).Inc() } diff --git a/notify/notify_test.go b/notify/notify_test.go index 6950eadd15..fb4a9a253d 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -425,27 +425,25 @@ func TestRetryStageWithError(t *testing.T) { func TestRetryStageWithErrorCode(t *testing.T) { testcases := map[string]struct { - reason string - reasonlabel string - expectedCount int + isNewErrorWithReason bool + reason Reason + reasonlabel string + expectedCount int }{ - "for clientError": {reason: clientError, reasonlabel: clientError, expectedCount: 1}, - "for serverError": {reason: serverError, reasonlabel: serverError, expectedCount: 1}, - "for unexpected code": {reason: "unexpected", reasonlabel: defaultFailureReason, expectedCount: 1}, + "for clientError": {isNewErrorWithReason: true, reason: ClientErrorReason, reasonlabel: ClientErrorReason.String(), expectedCount: 1}, + "for serverError": {isNewErrorWithReason: true, reason: ServerErrorReason, reasonlabel: ServerErrorReason.String(), expectedCount: 1}, + "for unexpected code": {isNewErrorWithReason: false, reason: DefaultReason, reasonlabel: DefaultReason.String(), expectedCount: 1}, } for _, testData := range testcases { - fail, retry := true, false - sent := []*types.Alert{} + retry := false testData := testData i := Integration{ name: "test", notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { - if fail { - fail = false - return retry, NewErrorWithReason(testData.reason, errors.New("fail to deliver notification")) + if !testData.isNewErrorWithReason { + return retry, errors.New("fail to deliver notification") } - sent = append(sent, alerts...) - return false, nil + return retry, NewErrorWithReason(testData.reason, errors.New("fail to deliver notification")) }), rs: sendResolved(false), } diff --git a/notify/util.go b/notify/util.go index 98d8943641..21c41a0538 100644 --- a/notify/util.go +++ b/notify/util.go @@ -30,17 +30,6 @@ import ( "github.com/prometheus/alertmanager/types" ) -// defaultFailureReason is the default reason for numTotalFailedNotifications metric -const defaultFailureReason = "other" - -const ( - clientError = "clientError" - serverError = "ServerError" -) - -// possibleFailureStatusCategory is a list of possible failure reason -var possibleFailureStatusCategory = []string{clientError, serverError, defaultFailureReason} - // UserAgentHeader is the default User-Agent for notification requests var UserAgentHeader = fmt.Sprintf("Alertmanager/%s", version.Version) @@ -225,10 +214,10 @@ type ErrorWithReason struct { Err error // The reason of the failure. - Reason string + Reason Reason } -func NewErrorWithReason(reason string, err error) *ErrorWithReason { +func NewErrorWithReason(reason Reason, err error) *ErrorWithReason { return &ErrorWithReason{ Err: err, Reason: reason, @@ -239,16 +228,41 @@ func (e *ErrorWithReason) Error() string { return e.Err.Error() } +// Reason is the failure reason +type Reason int + +const ( + DefaultReason Reason = iota + ClientErrorReason + ServerErrorReason +) + +func (s Reason) String() string { + switch s { + case DefaultReason: + return "other" + case ClientErrorReason: + return "clientError" + case ServerErrorReason: + return "serverError" + default: + panic(fmt.Sprintf("unknown Reason: %d", s)) + } +} + +// possibleFailureReasonCategory is a list of possible failure reason +var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()} + // GetFailureReasonFromStatusCode return the reason for failure request // the status starts with 4 will return 4xx and starts with 5 will return 5xx // other than 4xx and 5xx input status will return an 5xx. -func GetFailureReasonFromStatusCode(statusCode int) string { +func GetFailureReasonFromStatusCode(statusCode int) Reason { if statusCode/100 == 4 { - return clientError + return ClientErrorReason } if statusCode/100 == 5 { - return serverError + return ServerErrorReason } - return defaultFailureReason + return DefaultReason } From 3f5d3a221d3dceca6444db7f69c439a9384e6f27 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Thu, 2 Feb 2023 13:33:56 -0500 Subject: [PATCH 9/9] address comment Signed-off-by: Yijie Qin --- notify/util.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/notify/util.go b/notify/util.go index 21c41a0538..355ca11bc9 100644 --- a/notify/util.go +++ b/notify/util.go @@ -213,7 +213,6 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) { type ErrorWithReason struct { Err error - // The reason of the failure. Reason Reason } @@ -228,7 +227,7 @@ func (e *ErrorWithReason) Error() string { return e.Err.Error() } -// Reason is the failure reason +// Reason is the failure reason. type Reason int const ( @@ -250,12 +249,10 @@ func (s Reason) String() string { } } -// possibleFailureReasonCategory is a list of possible failure reason +// possibleFailureReasonCategory is a list of possible failure reason. var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()} -// GetFailureReasonFromStatusCode return the reason for failure request -// the status starts with 4 will return 4xx and starts with 5 will return 5xx -// other than 4xx and 5xx input status will return an 5xx. +// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided. func GetFailureReasonFromStatusCode(statusCode int) Reason { if statusCode/100 == 4 { return ClientErrorReason