Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add reason label to the numTotalFailedNotifications metric #3094

Merged
merged 10 commits into from
Feb 3, 2023
14 changes: 11 additions & 3 deletions notify/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
Namespace: "alertmanager",
Name: "notifications_failed_total",
Help: "The total number of failed notifications.",
}, []string{"integration"}),
}, []string{"integration", "reason"}),
numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_requests_total",
Expand Down Expand Up @@ -293,10 +293,13 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
"telegram",
} {
m.numNotifications.WithLabelValues(integration)
m.numTotalFailedNotifications.WithLabelValues(integration)
m.numNotificationRequestsTotal.WithLabelValues(integration)
m.numNotificationRequestsFailedTotal.WithLabelValues(integration)
m.notificationLatencySeconds.WithLabelValues(integration)

for _, reason := range possibleFailureReasonCategory {
m.numTotalFailedNotifications.WithLabelValues(integration, reason)
}
}
r.MustRegister(
m.numNotifications, m.numTotalFailedNotifications,
Expand Down Expand Up @@ -662,8 +665,13 @@ func NewRetryStage(i Integration, groupName string, metrics *Metrics) *RetryStag
func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
ctx, alerts, err := r.exec(ctx, l, alerts...)

failureReason := DefaultReason.String()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've filed #3231 so that we can do the same for all other notifiers.

if err != nil {
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
if e, ok := errors.Cause(err).(*ErrorWithReason); ok {
failureReason = e.Reason.String()
}
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), failureReason).Inc()
}
return ctx, alerts, err
}
Expand Down
52 changes: 52 additions & 0 deletions notify/notify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"gopkg.in/yaml.v2"
Expand Down Expand Up @@ -422,6 +423,57 @@ func TestRetryStageWithError(t *testing.T) {
require.NotNil(t, resctx)
}

func TestRetryStageWithErrorCode(t *testing.T) {
testcases := map[string]struct {
isNewErrorWithReason bool
reason Reason
reasonlabel string
expectedCount int
}{
"for clientError": {isNewErrorWithReason: true, reason: ClientErrorReason, reasonlabel: ClientErrorReason.String(), expectedCount: 1},
"for serverError": {isNewErrorWithReason: true, reason: ServerErrorReason, reasonlabel: ServerErrorReason.String(), expectedCount: 1},
"for unexpected code": {isNewErrorWithReason: false, reason: DefaultReason, reasonlabel: DefaultReason.String(), expectedCount: 1},
}
for _, testData := range testcases {
retry := false
testData := testData
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
if !testData.isNewErrorWithReason {
return retry, errors.New("fail to deliver notification")
}
return retry, NewErrorWithReason(testData.reason, errors.New("fail to deliver notification"))
}),
rs: sendResolved(false),
}
r := RetryStage{
integration: i,
metrics: NewMetrics(prometheus.NewRegistry()),
}

alerts := []*types.Alert{
{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
},
}

ctx := context.Background()
ctx = WithFiringAlerts(ctx, []uint64{0})

// Notify with a non-recoverable error.
resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
counter := r.metrics.numTotalFailedNotifications

require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.reasonlabel))))

require.NotNil(t, err)
require.NotNil(t, resctx)
}
}

func TestRetryStageNoResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{
Expand Down
5 changes: 4 additions & 1 deletion notify/sns/sns.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err
publishOutput, err := client.Publish(publishInput)
if err != nil {
if e, ok := err.(awserr.RequestFailure); ok {
return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
retryable, error := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))

reasonErr := notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(e.StatusCode()), error)
return retryable, reasonErr
}
return true, err
}
Expand Down
54 changes: 54 additions & 0 deletions notify/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,57 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) {
}
return retry, errors.New(s)
}

type ErrorWithReason struct {
Err error

Reason Reason
}

func NewErrorWithReason(reason Reason, err error) *ErrorWithReason {
return &ErrorWithReason{
Err: err,
Reason: reason,
}
}

func (e *ErrorWithReason) Error() string {
return e.Err.Error()
}

// Reason is the failure reason.
type Reason int

const (
DefaultReason Reason = iota
ClientErrorReason
ServerErrorReason
)

func (s Reason) String() string {
switch s {
case DefaultReason:
return "other"
case ClientErrorReason:
return "clientError"
case ServerErrorReason:
return "serverError"
default:
panic(fmt.Sprintf("unknown Reason: %d", s))
}
}

// possibleFailureReasonCategory is a list of possible failure reason.
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()}

// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided.
func GetFailureReasonFromStatusCode(statusCode int) Reason {
if statusCode/100 == 4 {
return ClientErrorReason
}
if statusCode/100 == 5 {
return ServerErrorReason
}

return DefaultReason
}