add status code label to the numTotalFailedNotifications metric
Signed-off-by: Yijie Qin <qinyijie@amazon.com>
qinxx108 committed Oct 7, 2022
1 parent 2c04c5a commit 61b7887
Showing 4 changed files with 98 additions and 4 deletions.
13 changes: 10 additions & 3 deletions notify/notify.go
@@ -51,6 +51,9 @@ type Peer interface {
// to a notification pipeline.
const MinTimeout = 10 * time.Second

// defaultStatusCode is the default status code for the numTotalFailedNotifications metric.
const defaultStatusCode = "5xx"

// Notifier notifies about alerts under constraints of the given context. It
// returns an error if unsuccessful and a flag whether the error is
// recoverable. This information is useful for a retry logic.
@@ -262,7 +265,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
Namespace: "alertmanager",
Name: "notifications_failed_total",
Help: "The total number of failed notifications.",
}, []string{"integration"}),
}, []string{"integration", "statusCode"}),
numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_requests_total",
@@ -293,7 +296,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
"telegram",
} {
m.numNotifications.WithLabelValues(integration)
m.numTotalFailedNotifications.WithLabelValues(integration)
m.numTotalFailedNotifications.WithLabelValues(integration, "")
m.numNotificationRequestsTotal.WithLabelValues(integration)
m.numNotificationRequestsFailedTotal.WithLabelValues(integration)
m.notificationLatencySeconds.WithLabelValues(integration)
@@ -663,7 +666,11 @@ func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
ctx, alerts, err := r.exec(ctx, l, alerts...)
if err != nil {
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
if e, ok := errors.Cause(err).(*ErrorWithStatusCode); ok {
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), getFailureStatusCodeCategory(e.StatusCode)).Inc()
} else {
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), defaultStatusCode).Inc()
}
}
return ctx, alerts, err
}
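
As a rough sketch (not part of this commit): with the change above, a failed notification whose error carries a known HTTP status is now counted under its 4xx/5xx category. The snippet assumes it runs inside the notify package; "webhook" is only an example integration name.

m := NewMetrics(prometheus.NewRegistry())

// A failed notification from an integration that returned an HTTP 4xx response.
m.numTotalFailedNotifications.WithLabelValues("webhook", "4xx").Inc()

// The series is then exposed roughly as:
// alertmanager_notifications_failed_total{integration="webhook",statusCode="4xx"} 1
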
55 changes: 55 additions & 0 deletions notify/notify_test.go
Expand Up @@ -24,6 +24,7 @@ import (

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"gopkg.in/yaml.v2"
@@ -422,6 +423,60 @@ func TestRetryStageWithError(t *testing.T) {
require.NotNil(t, resctx)
}

func TestRetryStageWithErrorCode(t *testing.T) {
testcases := map[string]struct {
errorcode int
codelabel string
expectedCount int
}{
"for 400": {errorcode: 400, codelabel: "4xx", expectedCount: 1},
"for 402": {errorcode: 402, codelabel: "4xx", expectedCount: 1},
"for 500": {errorcode: 500, codelabel: "5xx", expectedCount: 1},
"for 502": {errorcode: 502, codelabel: "5xx", expectedCount: 1},
}
for _, testData := range testcases {
fail, retry := true, false
sent := []*types.Alert{}
testData := testData
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
if fail {
fail = false
return retry, NewErrorWithStatusCode(testData.errorcode, errors.New("fail to deliver notification"))
}
sent = append(sent, alerts...)
return false, nil
}),
rs: sendResolved(false),
}
r := RetryStage{
integration: i,
metrics: NewMetrics(prometheus.NewRegistry()),
}

alerts := []*types.Alert{
{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
},
}

ctx := context.Background()
ctx = WithFiringAlerts(ctx, []uint64{0})

// Notify with a non-recoverable error.
resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
counter := r.metrics.numTotalFailedNotifications

require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.codelabel))))

require.NotNil(t, err)
require.NotNil(t, resctx)
}
}

func TestRetryStageNoResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{
5 changes: 4 additions & 1 deletion notify/sns/sns.go
@@ -83,7 +83,10 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err
publishOutput, err := client.Publish(publishInput)
if err != nil {
if e, ok := err.(awserr.RequestFailure); ok {
return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
retryable, error := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))

statusErr := notify.NewErrorWithStatusCode(e.StatusCode(), error)
return retryable, statusErr
}
return true, err
}
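
A hypothetical sketch (not from this commit) of how another HTTP-based notifier could propagate its status code the same way the SNS notifier now does; checkResponse and the example package are illustrative names, while Retrier.Check and NewErrorWithStatusCode come from notify/util.go below.

package example

import (
	"net/http"

	"github.com/prometheus/alertmanager/notify"
)

// checkResponse turns a non-2xx HTTP response into a retry decision plus an
// error that still carries the status code for the failure metric label.
func checkResponse(retrier *notify.Retrier, resp *http.Response) (bool, error) {
	defer resp.Body.Close()
	if resp.StatusCode/100 == 2 {
		return false, nil
	}
	retry, err := retrier.Check(resp.StatusCode, resp.Body)
	return retry, notify.NewErrorWithStatusCode(resp.StatusCode, err)
}
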
29 changes: 29 additions & 0 deletions notify/util.go
@@ -167,6 +167,17 @@ func readAll(r io.Reader) string {
return string(bs)
}

func getFailureStatusCodeCategory(statusCode int) string {
if statusCode/100 == 4 {
return "4xx"
}
if statusCode/100 == 5 {
return "5xx"
}

return ""
}
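
Illustrative calls (hypothetical, not part of the commit): status codes outside the 4xx/5xx ranges yield an empty label, and errors that carry no status code at all fall back to defaultStatusCode ("5xx") in RetryStage.Exec above.

getFailureStatusCodeCategory(404) // "4xx"
getFailureStatusCodeCategory(503) // "5xx"
getFailureStatusCodeCategory(302) // "" (neither a client nor a server error)
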

// Retrier knows when to retry an HTTP request to a receiver. 2xx status codes
// are successful, anything else is a failure and only 5xx status codes should
// be retried.
@@ -209,3 +220,21 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) {
}
return retry, errors.New(s)
}

type ErrorWithStatusCode struct {
Err error

// The status code of the HTTP response.
StatusCode int
}

func NewErrorWithStatusCode(statusCode int, err error) *ErrorWithStatusCode {
return &ErrorWithStatusCode{
Err: err,
StatusCode: statusCode,
}
}

func (e *ErrorWithStatusCode) Error() string {
return e.Err.Error()
}
