Creating new metric for filtering 4xx errors #2927

Closed
18 changes: 14 additions & 4 deletions notify/notify.go
@@ -55,7 +55,7 @@ const MinTimeout = 10 * time.Second
// returns an error if unsuccessful and a flag whether the error is
// recoverable. This information is useful for a retry logic.
type Notifier interface {
Notify(context.Context, ...*types.Alert) (bool, error)
Notify(context.Context, ...*types.Alert) (bool, bool, error)
}

// Integration wraps a notifier and its configuration to be uniquely identified
@@ -78,7 +78,7 @@ func NewIntegration(notifier Notifier, rs ResolvedSender, name string, idx int)
}

// Notify implements the Notifier interface.
func (i *Integration) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
func (i *Integration) Notify(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
return i.notifier.Notify(ctx, alerts...)
}

@@ -246,6 +246,7 @@ type NotificationLog interface {
type Metrics struct {
numNotifications *prometheus.CounterVec
numTotalFailedNotifications *prometheus.CounterVec
numTotal4xxFailedNotifications *prometheus.CounterVec
numNotificationRequestsTotal *prometheus.CounterVec
numNotificationRequestsFailedTotal *prometheus.CounterVec
notificationLatencySeconds *prometheus.HistogramVec
@@ -263,6 +264,11 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
Name: "notifications_failed_total",
Help: "The total number of failed notifications.",
}, []string{"integration"}),
numTotal4xxFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notifications_4xx_failed_total",
Help: "The total number of failed 4xx notifications.",
}, []string{"integration"}),
numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_requests_total",
@@ -294,12 +300,13 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
} {
m.numNotifications.WithLabelValues(integration)
m.numTotalFailedNotifications.WithLabelValues(integration)
m.numTotal4xxFailedNotifications.WithLabelValues(integration)
m.numNotificationRequestsTotal.WithLabelValues(integration)
m.numNotificationRequestsFailedTotal.WithLabelValues(integration)
m.notificationLatencySeconds.WithLabelValues(integration)
}
r.MustRegister(
m.numNotifications, m.numTotalFailedNotifications,
m.numNotifications, m.numTotalFailedNotifications, m.numTotal4xxFailedNotifications,
m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal,
m.notificationLatencySeconds,
)
@@ -719,11 +726,14 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
select {
case <-tick.C:
now := time.Now()
retry, err := r.integration.Notify(ctx, sent...)
retry, is4xx, err := r.integration.Notify(ctx, sent...)
r.metrics.notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds())
r.metrics.numNotificationRequestsTotal.WithLabelValues(r.integration.Name()).Inc()
if err != nil {
r.metrics.numNotificationRequestsFailedTotal.WithLabelValues(r.integration.Name()).Inc()
if is4xx {
r.metrics.numTotal4xxFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
}
if !retry {
return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i)
}
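
Not part of this diff: a minimal sketch of what an integration looks like once Notify returns the (retry, is4xx, err) triple. exampleNotifier and its httpDo field are hypothetical placeholders; Retrier and Check4xxStatus are the real helpers from this package, and types.Alert is the real alert type.

package notify

import (
	"context"
	"net/http"

	"github.com/prometheus/alertmanager/types"
)

// exampleNotifier is a hypothetical HTTP-based integration used only to
// illustrate the new three-value Notify signature.
type exampleNotifier struct {
	url     string
	retrier *Retrier
	// httpDo stands in for whatever request code a real integration uses.
	httpDo func(ctx context.Context, url string, alerts []*types.Alert) (*http.Response, error)
}

func (n *exampleNotifier) Notify(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
	resp, err := n.httpDo(ctx, n.url, alerts)
	if err != nil {
		// Transport-level failure: worth retrying, but not attributable to a 4xx.
		return true, false, err
	}
	defer resp.Body.Close()

	// The retrier decides whether to retry; the new helper decides whether a
	// failure counts toward the 4xx counter.
	is4xx := Check4xxStatus(resp.StatusCode)
	retry, err := n.retrier.Check(resp.StatusCode, resp.Body)
	return retry, is4xx, err
}

Note that in the retry stage above the new counter is only incremented when Notify also returns a non-nil error, so an integration reporting is4xx as true alongside a nil error is not counted.
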
92 changes: 83 additions & 9 deletions notify/notify_test.go
@@ -19,11 +19,14 @@ import (
"fmt"
"io"
"reflect"
"strconv"
"testing"
"time"

"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"gopkg.in/yaml.v2"
@@ -42,9 +45,9 @@ func (s sendResolved) SendResolved() bool {
return bool(s)
}

type notifierFunc func(ctx context.Context, alerts ...*types.Alert) (bool, error)
type notifierFunc func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error)

func (f notifierFunc) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
func (f notifierFunc) Notify(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
return f(ctx, alerts...)
}

@@ -380,13 +383,13 @@ func TestRetryStageWithError(t *testing.T) {
fail, retry := true, true
sent := []*types.Alert{}
i := Integration{
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
if fail {
fail = false
return retry, errors.New("fail to deliver notification")
return retry, false, errors.New("fail to deliver notification")
}
sent = append(sent, alerts...)
return false, nil
return false, false, nil
}),
rs: sendResolved(false),
}
@@ -425,9 +428,9 @@
func TestRetryStageNoResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
sent = append(sent, alerts...)
return false, nil
return false, false, nil
}),
rs: sendResolved(false),
}
@@ -479,9 +482,9 @@
func TestRetryStageSendResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
sent = append(sent, alerts...)
return false, nil
return false, false, nil
}),
rs: sendResolved(true),
}
@@ -913,3 +916,74 @@ func BenchmarkHashAlert(b *testing.B) {
hashAlert(alert)
}
}

func TestRetryStageWithErrorCode(t *testing.T) {
testcases := map[string]struct {
errorcode int
expected4xxCount int
}{
"for 400": {errorcode: 400, expected4xxCount: 1},
"for 401": {errorcode: 401, expected4xxCount: 1},
"for 403": {errorcode: 403, expected4xxCount: 1},
"for 404": {errorcode: 404, expected4xxCount: 1},
"for 413": {errorcode: 413, expected4xxCount: 1},
"for 422": {errorcode: 422, expected4xxCount: 1},
"for 429": {errorcode: 429, expected4xxCount: 1},
"for 500": {errorcode: 500, expected4xxCount: 0},
"for 502": {errorcode: 502, expected4xxCount: 0},
}
for _, testData := range testcases {
fail, retry := true, true
sent := []*types.Alert{}
testData := testData
is4xx := Check4xxStatus(testData.errorcode)
i := Integration{
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) {
if fail {
fail = false
return retry, is4xx, awserr.New(strconv.Itoa(testData.errorcode), "Not found", errors.New("fail to deliver notification"))
}
sent = append(sent, alerts...)
return false, is4xx, nil
}),
rs: sendResolved(false),
}
r := RetryStage{
integration: i,
metrics: NewMetrics(prometheus.NewRegistry()),
}

alerts := []*types.Alert{
{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
},
}

ctx := context.Background()
ctx = WithFiringAlerts(ctx, []uint64{0})

// Notify with a recoverable error should retry and succeed.
resctx, res, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
counter := r.metrics.numTotal4xxFailedNotifications
require.Equal(t, testData.expected4xxCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name()))))

require.Nil(t, err)
require.Equal(t, alerts, res)
require.Equal(t, alerts, sent)
require.NotNil(t, resctx)

// Notify with an unrecoverable error should fail.
sent = sent[:0]
fail = true
retry = false

resctx, _, err = r.Exec(ctx, log.NewNopLogger(), alerts...)
require.NotNil(t, err)
require.NotNil(t, resctx)

}
}
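
A possible follow-up, not part of this diff: the new Check4xxStatus helper is only exercised indirectly through the retry stage here, and a direct table test in the same package is cheap and keeps the boundary cases (399, 499) pinned down. The test name and cases below are suggestions.

func TestCheck4xxStatus(t *testing.T) {
	// Boundary and representative codes; 4xx means statusCode/100 == 4.
	for code, want := range map[int]bool{
		399: false,
		400: true,
		404: true,
		429: true,
		499: true,
		500: false,
		502: false,
	} {
		if got := Check4xxStatus(code); got != want {
			t.Errorf("Check4xxStatus(%d) = %v, want %v", code, got, want)
		}
	}
}
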
29 changes: 19 additions & 10 deletions notify/sns/sns.go
@@ -60,37 +60,46 @@ func New(c *config.SNSConfig, t *template.Template, l log.Logger, httpOpts ...co
}, nil
}

func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, error) {
func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, bool, error) {
var (
err error
data = notify.GetTemplateData(ctx, n.tmpl, alert, n.logger)
tmpl = notify.TmplText(n.tmpl, data, &err)
err error
data = notify.GetTemplateData(ctx, n.tmpl, alert, n.logger)
tmpl = notify.TmplText(n.tmpl, data, &err)
is4xx = false
retry = false
)

client, err := n.createSNSClient(tmpl)
if err != nil {
if e, ok := err.(awserr.RequestFailure); ok {
return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
is4xx = notify.Check4xxStatus(e.StatusCode())
retry, err = n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
return retry, is4xx, err
}
return true, err
return true, is4xx, err
}

publishInput, err := n.createPublishInput(ctx, tmpl)
if err != nil {
return true, err
if e, ok := err.(awserr.RequestFailure); ok {
is4xx = notify.Check4xxStatus(e.StatusCode())
}
return true, is4xx, err
}

publishOutput, err := client.Publish(publishInput)
if err != nil {
if e, ok := err.(awserr.RequestFailure); ok {
return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
is4xx = notify.Check4xxStatus(e.StatusCode())
retry, err = n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
return retry, is4xx, err
}
return true, err
return true, is4xx, err
}

level.Debug(n.logger).Log("msg", "SNS message successfully published", "message_id", publishOutput.MessageId, "sequence number", publishOutput.SequenceNumber)

return false, nil
return false, is4xx, nil
}

func (n *Notifier) createSNSClient(tmpl func(string) string) (*sns.SNS, error) {
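
Not part of this diff: two of the error paths above repeat the same awserr.RequestFailure handling, and a small helper could keep each return to one line. classifyAWSError is a hypothetical name; awserr, strings, notify, and n.retrier already exist in this file.

// classifyAWSError maps an AWS SDK error onto the (retry, is4xx, err) triple
// returned by Notify. Hypothetical refactor sketch, not part of this PR.
func (n *Notifier) classifyAWSError(err error) (bool, bool, error) {
	if e, ok := err.(awserr.RequestFailure); ok {
		is4xx := notify.Check4xxStatus(e.StatusCode())
		retry, checkErr := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
		return retry, is4xx, checkErr
	}
	// Not an AWS request failure: treat it as retryable and not a client error.
	return true, false, err
}

Each error branch in Notify could then become return n.classifyAWSError(err), with the caveat that the createPublishInput branch currently always retries, so folding it in would slightly change its behavior.
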
8 changes: 8 additions & 0 deletions notify/util.go
@@ -210,3 +210,11 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) {
}
return retry, errors.New(s)
}

// Check4xxStatus reports whether the given HTTP status code is a 4xx
// (client error) status.
func Check4xxStatus(statusCode int) bool {
return statusCode/100 == 4
}
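
Once merged, the new counter should be exposed as alertmanager_notifications_4xx_failed_total (the alertmanager namespace joined with the metric name), labeled by integration, next to the existing alertmanager_notifications_failed_total. A query along the lines of rate(alertmanager_notifications_4xx_failed_total[5m]) > 0 could then separate persistent client-side rejections (bad credentials, malformed payloads, unknown endpoints) from the transient failures the existing counter already captures.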