From 22e067fe78b3c4e9234a30e98a221a2deb4c31f9 Mon Sep 17 00:00:00 2001 From: Yijie Qin Date: Fri, 3 Feb 2023 07:09:21 -0500 Subject: [PATCH] add status code label to the numTotalFailedNotifications metric (#3094) * add reason label to the numTotalFailedNotifications metric Signed-off-by: Yijie Qin --- notify/notify.go | 14 ++++++++--- notify/notify_test.go | 52 +++++++++++++++++++++++++++++++++++++++++ notify/sns/sns.go | 5 +++- notify/util.go | 54 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 4 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index f0691ba0eb..d205d5cac2 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -262,7 +262,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics { Namespace: "alertmanager", Name: "notifications_failed_total", Help: "The total number of failed notifications.", - }, []string{"integration"}), + }, []string{"integration", "reason"}), numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "alertmanager", Name: "notification_requests_total", @@ -293,10 +293,13 @@ func NewMetrics(r prometheus.Registerer) *Metrics { "telegram", } { m.numNotifications.WithLabelValues(integration) - m.numTotalFailedNotifications.WithLabelValues(integration) m.numNotificationRequestsTotal.WithLabelValues(integration) m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) + + for _, reason := range possibleFailureReasonCategory { + m.numTotalFailedNotifications.WithLabelValues(integration, reason) + } } r.MustRegister( m.numNotifications, m.numTotalFailedNotifications, @@ -662,8 +665,13 @@ func NewRetryStage(i Integration, groupName string, metrics *Metrics) *RetryStag func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc() ctx, alerts, err := r.exec(ctx, l, alerts...) + + failureReason := DefaultReason.String() if err != nil { - r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name()).Inc() + if e, ok := errors.Cause(err).(*ErrorWithReason); ok { + failureReason = e.Reason.String() + } + r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), failureReason).Inc() } return ctx, alerts, err } diff --git a/notify/notify_test.go b/notify/notify_test.go index b7f6c5e0ce..996c132dd8 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -24,6 +24,7 @@ import ( "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "gopkg.in/yaml.v2" @@ -422,6 +423,57 @@ func TestRetryStageWithError(t *testing.T) { require.NotNil(t, resctx) } +func TestRetryStageWithErrorCode(t *testing.T) { + testcases := map[string]struct { + isNewErrorWithReason bool + reason Reason + reasonlabel string + expectedCount int + }{ + "for clientError": {isNewErrorWithReason: true, reason: ClientErrorReason, reasonlabel: ClientErrorReason.String(), expectedCount: 1}, + "for serverError": {isNewErrorWithReason: true, reason: ServerErrorReason, reasonlabel: ServerErrorReason.String(), expectedCount: 1}, + "for unexpected code": {isNewErrorWithReason: false, reason: DefaultReason, reasonlabel: DefaultReason.String(), expectedCount: 1}, + } + for _, testData := range testcases { + retry := false + testData := testData + i := Integration{ + name: "test", + notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { + if !testData.isNewErrorWithReason { + return retry, errors.New("fail to deliver notification") + } + return retry, NewErrorWithReason(testData.reason, errors.New("fail to deliver notification")) + }), + rs: sendResolved(false), + } + r := RetryStage{ + integration: i, + metrics: NewMetrics(prometheus.NewRegistry()), + } + + alerts := []*types.Alert{ + { + Alert: model.Alert{ + EndsAt: time.Now().Add(time.Hour), + }, + }, + } + + ctx := context.Background() + ctx = WithFiringAlerts(ctx, []uint64{0}) + + // Notify with a non-recoverable error. + resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...) + counter := r.metrics.numTotalFailedNotifications + + require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.reasonlabel)))) + + require.NotNil(t, err) + require.NotNil(t, resctx) + } +} + func TestRetryStageNoResolved(t *testing.T) { sent := []*types.Alert{} i := Integration{ diff --git a/notify/sns/sns.go b/notify/sns/sns.go index 1dfd42b4a5..b9881b0175 100644 --- a/notify/sns/sns.go +++ b/notify/sns/sns.go @@ -83,7 +83,10 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err publishOutput, err := client.Publish(publishInput) if err != nil { if e, ok := err.(awserr.RequestFailure); ok { - return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + retryable, error := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + + reasonErr := notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(e.StatusCode()), error) + return retryable, reasonErr } return true, err } diff --git a/notify/util.go b/notify/util.go index 6b71ac7b29..706856c160 100644 --- a/notify/util.go +++ b/notify/util.go @@ -245,3 +245,57 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) { } return retry, errors.New(s) } + +type ErrorWithReason struct { + Err error + + Reason Reason +} + +func NewErrorWithReason(reason Reason, err error) *ErrorWithReason { + return &ErrorWithReason{ + Err: err, + Reason: reason, + } +} + +func (e *ErrorWithReason) Error() string { + return e.Err.Error() +} + +// Reason is the failure reason. +type Reason int + +const ( + DefaultReason Reason = iota + ClientErrorReason + ServerErrorReason +) + +func (s Reason) String() string { + switch s { + case DefaultReason: + return "other" + case ClientErrorReason: + return "clientError" + case ServerErrorReason: + return "serverError" + default: + panic(fmt.Sprintf("unknown Reason: %d", s)) + } +} + +// possibleFailureReasonCategory is a list of possible failure reason. +var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()} + +// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided. +func GetFailureReasonFromStatusCode(statusCode int) Reason { + if statusCode/100 == 4 { + return ClientErrorReason + } + if statusCode/100 == 5 { + return ServerErrorReason + } + + return DefaultReason +}