From 6e8274f3ab850cda4157051d26caed9451c12777 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Tue, 16 Mar 2021 15:14:52 +0530 Subject: [PATCH 01/85] AlertingNG: Add alert provider and basic structure with dispatcher, silences and delivery stages (#31833) * AlertingNG: Add alert provider Signed-off-by: Ganesh Vernekar * Add unit tests Signed-off-by: Ganesh Vernekar * Alertmanager WIP * Merge alertmanager into notifier Signed-off-by: Ganesh Vernekar * Fixes for PR 31833 (#31990) Signed-off-by: Ganesh Vernekar * Use alertmanager from upgrad-uuid temporarily to unblock Signed-off-by: Ganesh Vernekar * Fix lint Signed-off-by: Ganesh Vernekar Co-authored-by: Josue Abreu --- pkg/services/ngalert/notifier/alertmanager.go | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 pkg/services/ngalert/notifier/alertmanager.go diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go new file mode 100644 index 00000000..91fc4ab6 --- /dev/null +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -0,0 +1,275 @@ +package notifier + +import ( + "context" + "path/filepath" + "sort" + "sync" + "time" + + gokit_log "github.com/go-kit/kit/log" + "github.com/pkg/errors" + "github.com/prometheus/alertmanager/config" + "github.com/prometheus/alertmanager/dispatch" + "github.com/prometheus/alertmanager/nflog" + "github.com/prometheus/alertmanager/nflog/nflogpb" + "github.com/prometheus/alertmanager/notify" + "github.com/prometheus/alertmanager/pkg/labels" + "github.com/prometheus/alertmanager/silence" + "github.com/prometheus/alertmanager/silence/silencepb" + "github.com/prometheus/alertmanager/types" + "github.com/prometheus/client_golang/prometheus" + + "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/registry" + "github.com/grafana/grafana/pkg/setting" +) + +type Alertmanager struct { + logger log.Logger + + // notificationLog keeps tracks 
of which notifications we've fired already. + notificationLog *nflog.Log + // silences keeps the track of which notifications we should not fire due to user configuration. + silences *silence.Silences + marker types.Marker + alerts *AlertProvider + dispatcher *dispatch.Dispatcher + + wg sync.WaitGroup +} + +type WithReceiverStage struct { +} + +func (s *WithReceiverStage) Exec(ctx context.Context, l gokit_log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { + //TODO: Alerts with a receiver should be handled here. + return ctx, nil, nil +} + +func init() { + registry.RegisterService(&Alertmanager{}) +} + +func (am *Alertmanager) IsDisabled() bool { + return !setting.AlertingEnabled || !setting.ExecuteAlerts +} + +func (am *Alertmanager) Init() error { + am.logger = log.New("alertmanager") + + return nil +} + +func (am *Alertmanager) Run(ctx context.Context) error { + //TODO: Speak with David Parrot wrt to the marker, we'll probably need our own. + am.marker = types.NewMarker(prometheus.DefaultRegisterer) + + var err error + am.alerts, err = NewAlertProvider(&WithReceiverStage{}, am.marker, gokit_log.NewNopLogger()) + if err != nil { + return errors.Wrap(err, "failed to initialize alerting storage component") + } + + am.silences, err = silence.New(silence.Options{ + SnapshotFile: filepath.Join("dir", "silences"), //TODO: This is a setting + Retention: time.Hour * 24, //TODO: This is also a setting + }) + if err != nil { + return errors.Wrap(err, "unable to initialize the silencing component of alerting") + } + + am.notificationLog, err = nflog.New( + nflog.WithRetention(time.Hour*24), //TODO: This is a setting. 
+ nflog.WithSnapshot(filepath.Join("dir", "notifications")), //TODO: This should be a setting + ) + if err != nil { + return errors.Wrap(err, "unable to initialize the notification log component of alerting") + } + + { + // Now, let's put together our notification pipeline + receivers := buildIntegrationsMap() + routingStage := make(notify.RoutingStage, len(receivers)) + + silencingStage := notify.NewMuteStage(silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger())) + //TODO: We need to unify these receivers + for name := range receivers { + stage := createReceiverStage(name, receivers[name], waitFunc, am.notificationLog) + routingStage[name] = notify.MultiStage{silencingStage, stage} + } + am.dispatcher = dispatch.NewDispatcher(am.alerts, BuildRoutingConfiguration(), routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) + } + + am.wg.Add(1) + go am.dispatcher.Run() + return nil +} + +// CreateAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not +func (am *Alertmanager) CreateAlerts(alerts ...*PostableAlert) error { + return am.alerts.PutPostableAlert(alerts...) 
+} + +func (am *Alertmanager) ListSilences(matchers []*labels.Matcher) ([]types.Silence, error) { + pbsilences, _, err := am.silences.Query() + if err != nil { + return nil, errors.Wrap(err, "unable to query for the list of silences") + } + r := []types.Silence{} + for _, pbs := range pbsilences { + s, err := silenceFromProto(pbs) + if err != nil { + return nil, errors.Wrap(err, "unable to marshal silence") + } + + sms := make(map[string]string) + for _, m := range s.Matchers { + sms[m.Name] = m.Value + } + + if !matchFilterLabels(matchers, sms) { + continue + } + + r = append(r, *s) + } + + var active, pending, expired []types.Silence + for _, s := range r { + switch s.Status.State { + case types.SilenceStateActive: + active = append(active, s) + case types.SilenceStatePending: + pending = append(pending, s) + case types.SilenceStateExpired: + expired = append(expired, s) + } + } + + sort.Slice(active, func(i int, j int) bool { + return active[i].EndsAt.Before(active[j].EndsAt) + }) + sort.Slice(pending, func(i int, j int) bool { + return pending[i].StartsAt.Before(pending[j].EndsAt) + }) + sort.Slice(expired, func(i int, j int) bool { + return expired[i].EndsAt.After(expired[j].EndsAt) + }) + + // Initialize silences explicitly to an empty list (instead of nil) + // So that it does not get converted to "null" in JSON. + silences := []types.Silence{} + silences = append(silences, active...) + silences = append(silences, pending...) + silences = append(silences, expired...) + + return silences, nil +} + +func (am *Alertmanager) GetSilence(silence *types.Silence) {} +func (am *Alertmanager) CreateSilence(silence *types.Silence) {} +func (am *Alertmanager) DeleteSilence(silence *types.Silence) {} + +// createReceiverStage creates a pipeline of stages for a receiver. 
+func createReceiverStage(name string, integrations []notify.Integration, wait func() time.Duration, notificationLog notify.NotificationLog) notify.Stage { + var fs notify.FanoutStage + for i := range integrations { + recv := &nflogpb.Receiver{ + GroupName: name, + Integration: integrations[i].Name(), + Idx: uint32(integrations[i].Index()), + } + var s notify.MultiStage + s = append(s, notify.NewWaitStage(wait)) + s = append(s, notify.NewDedupStage(&integrations[i], notificationLog, recv)) + //TODO: This probably won't work w/o the metrics + s = append(s, notify.NewRetryStage(integrations[i], name, nil)) + s = append(s, notify.NewSetNotifiesStage(notificationLog, recv)) + + fs = append(fs, s) + } + return fs +} + +// BuildRoutingConfiguration produces an alertmanager-based routing configuration. +func BuildRoutingConfiguration() *dispatch.Route { + var cfg *config.Config + return dispatch.NewRoute(cfg.Route, nil) +} + +func buildIntegrationsMap() map[string][]notify.Integration { + return map[string][]notify.Integration{} +} + +func waitFunc() time.Duration { + return setting.AlertingNotificationTimeout +} + +func timeoutFunc(d time.Duration) time.Duration { + //TODO: What does MinTimeout means here? 
+ if d < notify.MinTimeout { + d = notify.MinTimeout + } + return d + waitFunc() +} + +// copied from the Alertmanager +func silenceFromProto(s *silencepb.Silence) (*types.Silence, error) { + sil := &types.Silence{ + ID: s.Id, + StartsAt: s.StartsAt, + EndsAt: s.EndsAt, + UpdatedAt: s.UpdatedAt, + Status: types.SilenceStatus{ + State: types.CalcSilenceState(s.StartsAt, s.EndsAt), + }, + Comment: s.Comment, + CreatedBy: s.CreatedBy, + } + for _, m := range s.Matchers { + var t labels.MatchType + switch m.Type { + case silencepb.Matcher_EQUAL: + t = labels.MatchEqual + case silencepb.Matcher_REGEXP: + t = labels.MatchRegexp + case silencepb.Matcher_NOT_EQUAL: + t = labels.MatchNotEqual + case silencepb.Matcher_NOT_REGEXP: + t = labels.MatchNotRegexp + } + matcher, err := labels.NewMatcher(t, m.Name, m.Pattern) + if err != nil { + return nil, err + } + + sil.Matchers = append(sil.Matchers, matcher) + } + + return sil, nil +} + +func matchFilterLabels(matchers []*labels.Matcher, sms map[string]string) bool { + for _, m := range matchers { + v, prs := sms[m.Name] + switch m.Type { + case labels.MatchNotRegexp, labels.MatchNotEqual: + if m.Value == "" && prs { + continue + } + if !m.Matches(v) { + return false + } + default: + if m.Value == "" && !prs { + continue + } + if !m.Matches(v) { + return false + } + } + } + + return true +} From 434e614053612c393ea86135420742b1648d29a3 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 17 Mar 2021 22:08:33 +0530 Subject: [PATCH 02/85] AlertingNG: Fix the alerting stage for legacy alerts (#32025) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 91fc4ab6..da0c29a1 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ 
b/pkg/services/ngalert/notifier/alertmanager.go @@ -39,14 +39,6 @@ type Alertmanager struct { wg sync.WaitGroup } -type WithReceiverStage struct { -} - -func (s *WithReceiverStage) Exec(ctx context.Context, l gokit_log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { - //TODO: Alerts with a receiver should be handled here. - return ctx, nil, nil -} - func init() { registry.RegisterService(&Alertmanager{}) } @@ -66,11 +58,6 @@ func (am *Alertmanager) Run(ctx context.Context) error { am.marker = types.NewMarker(prometheus.DefaultRegisterer) var err error - am.alerts, err = NewAlertProvider(&WithReceiverStage{}, am.marker, gokit_log.NewNopLogger()) - if err != nil { - return errors.Wrap(err, "failed to initialize alerting storage component") - } - am.silences, err = silence.New(silence.Options{ SnapshotFile: filepath.Join("dir", "silences"), //TODO: This is a setting Retention: time.Hour * 24, //TODO: This is also a setting @@ -98,6 +85,12 @@ func (am *Alertmanager) Run(ctx context.Context) error { stage := createReceiverStage(name, receivers[name], waitFunc, am.notificationLog) routingStage[name] = notify.MultiStage{silencingStage, stage} } + + am.alerts, err = NewAlertProvider(routingStage, am.marker, gokit_log.NewNopLogger()) + if err != nil { + return errors.Wrap(err, "failed to initialize alerting storage component") + } + am.dispatcher = dispatch.NewDispatcher(am.alerts, BuildRoutingConfiguration(), routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) } From ed825a4a0cd1af69bb12498e4086b5ab7fb89cf8 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Fri, 19 Mar 2021 13:56:00 +0530 Subject: [PATCH 03/85] AlertingNG: Refactor notifier to support config reloads (#32099) * AlertingNG: Refactor notifier to support config reloads Signed-off-by: Ganesh Vernekar * Fix review comments and make reloading of config a sync operation Signed-off-by: Ganesh Vernekar * Fix review comments 
Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 117 ++++++++++++------ 1 file changed, 82 insertions(+), 35 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index da0c29a1..dceb95f0 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -20,6 +20,7 @@ import ( "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" + "github.com/grafana/alerting-api/pkg/api" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/registry" "github.com/grafana/grafana/pkg/setting" @@ -31,12 +32,14 @@ type Alertmanager struct { // notificationLog keeps tracks of which notifications we've fired already. notificationLog *nflog.Log // silences keeps the track of which notifications we should not fire due to user configuration. - silences *silence.Silences - marker types.Marker - alerts *AlertProvider - dispatcher *dispatch.Dispatcher + silences *silence.Silences + marker types.Marker + alerts *AlertProvider - wg sync.WaitGroup + dispatcher *dispatch.Dispatcher + dispatcherWG sync.WaitGroup + + reloadConfigMtx sync.Mutex } func init() { @@ -47,17 +50,18 @@ func (am *Alertmanager) IsDisabled() bool { return !setting.AlertingEnabled || !setting.ExecuteAlerts } -func (am *Alertmanager) Init() error { +func (am *Alertmanager) Init() (err error) { am.logger = log.New("alertmanager") - - return nil -} - -func (am *Alertmanager) Run(ctx context.Context) error { //TODO: Speak with David Parrot wrt to the marker, we'll probably need our own. am.marker = types.NewMarker(prometheus.DefaultRegisterer) - var err error + am.notificationLog, err = nflog.New( + nflog.WithRetention(time.Hour*24), //TODO: This is a setting. 
+ nflog.WithSnapshot(filepath.Join("dir", "notifications")), //TODO: This should be a setting + ) + if err != nil { + return errors.Wrap(err, "unable to initialize the notification log component of alerting") + } am.silences, err = silence.New(silence.Options{ SnapshotFile: filepath.Join("dir", "silences"), //TODO: This is a setting Retention: time.Hour * 24, //TODO: This is also a setting @@ -66,36 +70,79 @@ func (am *Alertmanager) Run(ctx context.Context) error { return errors.Wrap(err, "unable to initialize the silencing component of alerting") } - am.notificationLog, err = nflog.New( - nflog.WithRetention(time.Hour*24), //TODO: This is a setting. - nflog.WithSnapshot(filepath.Join("dir", "notifications")), //TODO: This should be a setting - ) - if err != nil { - return errors.Wrap(err, "unable to initialize the notification log component of alerting") + return nil +} + +func (am *Alertmanager) Run(ctx context.Context) error { + // Make sure dispatcher starts. We can tolerate future reload failures. + if err := am.ReloadConfigFromDatabase(); err != nil { + return err + } + for { + select { + case <-ctx.Done(): + am.StopAndWait() + return nil + case <-time.After(1 * time.Minute): + // TODO: once we have a check to skip reload on same config, uncomment this. 
+ //if err := am.ReloadConfigFromDatabase(); err != nil { + // am.logger.Error("failed to sync config from database", "error", err) + //} + } } +} - { - // Now, let's put together our notification pipeline - receivers := buildIntegrationsMap() - routingStage := make(notify.RoutingStage, len(receivers)) +func (am *Alertmanager) StopAndWait() { + if am.dispatcher != nil { + am.dispatcher.Stop() + } + am.dispatcherWG.Wait() +} - silencingStage := notify.NewMuteStage(silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger())) - //TODO: We need to unify these receivers - for name := range receivers { - stage := createReceiverStage(name, receivers[name], waitFunc, am.notificationLog) - routingStage[name] = notify.MultiStage{silencingStage, stage} - } +// ReloadConfigFromDatabase picks the latest config from database and restarts +// the components with the new config. +func (am *Alertmanager) ReloadConfigFromDatabase() error { + am.reloadConfigMtx.Lock() + defer am.reloadConfigMtx.Unlock() - am.alerts, err = NewAlertProvider(routingStage, am.marker, gokit_log.NewNopLogger()) - if err != nil { - return errors.Wrap(err, "failed to initialize alerting storage component") - } + // TODO: check if config is same as before using hashes and skip reload in case they are same. + cfg, err := getConfigFromDatabase() + if err != nil { + return errors.Wrap(err, "get config from database") + } + return errors.Wrap(am.ApplyConfig(cfg), "reload from config") +} - am.dispatcher = dispatch.NewDispatcher(am.alerts, BuildRoutingConfiguration(), routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) +func getConfigFromDatabase() (*api.PostableApiAlertingConfig, error) { + // TODO: get configs from the database. + return &api.PostableApiAlertingConfig{}, nil +} + +// ApplyConfig applies a new configuration by re-initializing all components using the configuration provided. +// It is not safe to call concurrently. 
+func (am *Alertmanager) ApplyConfig(cfg *api.PostableApiAlertingConfig) error { + // Now, let's put together our notification pipeline + receivers := buildIntegrationsMap() + routingStage := make(notify.RoutingStage, len(receivers)) + + silencingStage := notify.NewMuteStage(silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger())) + //TODO: We need to unify these receivers + for name := range receivers { + stage := createReceiverStage(name, receivers[name], waitFunc, am.notificationLog) + routingStage[name] = notify.MultiStage{silencingStage, stage} } - am.wg.Add(1) - go am.dispatcher.Run() + am.alerts.SetStage(routingStage) + + am.StopAndWait() + am.dispatcher = dispatch.NewDispatcher(am.alerts, BuildRoutingConfiguration(), routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) + + am.dispatcherWG.Add(1) + go func() { + defer am.dispatcherWG.Done() + am.dispatcher.Run() + }() + return nil } From 7f0565e41ae22e68a676561dfb3b3bd9358d0432 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Mar 2021 14:20:44 +0000 Subject: [PATCH 04/85] Alerting: Fetch configuration from the database and run a notification service (#32175) * Alerting: Fetch configuration from the database and run a notification instance Co-Authored-By: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> --- pkg/services/ngalert/notifier/alertmanager.go | 138 ++++++++++++++---- 1 file changed, 108 insertions(+), 30 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index dceb95f0..476b91bf 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -9,7 +9,6 @@ import ( gokit_log "github.com/go-kit/kit/log" "github.com/pkg/errors" - "github.com/prometheus/alertmanager/config" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/nflog" "github.com/prometheus/alertmanager/nflog/nflogpb" @@ -17,17 +16,30 @@ import ( 
"github.com/prometheus/alertmanager/pkg/labels" "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/silence/silencepb" + "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" "github.com/grafana/alerting-api/pkg/api" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/registry" + "github.com/grafana/grafana/pkg/services/ngalert/models" + "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" + "github.com/grafana/grafana/pkg/services/ngalert/store" + "github.com/grafana/grafana/pkg/services/sqlstore" + "github.com/grafana/grafana/pkg/services/sqlstore/migrator" "github.com/grafana/grafana/pkg/setting" ) +const ( + workingDir = "alerting" +) + type Alertmanager struct { - logger log.Logger + logger log.Logger + Settings *setting.Cfg `inject:""` + SQLStore *sqlstore.SQLStore `inject:""` + Store store.AlertingStore // notificationLog keeps tracks of which notifications we've fired already. notificationLog *nflog.Log @@ -52,8 +64,9 @@ func (am *Alertmanager) IsDisabled() bool { func (am *Alertmanager) Init() (err error) { am.logger = log.New("alertmanager") - //TODO: Speak with David Parrot wrt to the marker, we'll probably need our own. - am.marker = types.NewMarker(prometheus.DefaultRegisterer) + r := prometheus.NewRegistry() + am.marker = types.NewMarker(r) + am.Store = store.DBstore{SQLStore: am.SQLStore} am.notificationLog, err = nflog.New( nflog.WithRetention(time.Hour*24), //TODO: This is a setting. @@ -70,14 +83,20 @@ func (am *Alertmanager) Init() (err error) { return errors.Wrap(err, "unable to initialize the silencing component of alerting") } + am.alerts, err = NewAlertProvider(nil, am.marker) + if err != nil { + return errors.Wrap(err, "unable to initialize the alert provider component of alerting") + } + return nil } func (am *Alertmanager) Run(ctx context.Context) error { // Make sure dispatcher starts. 
We can tolerate future reload failures. - if err := am.ReloadConfigFromDatabase(); err != nil { + if err := am.SyncAndApplyConfigFromDatabase(); err != nil && !errors.Is(err, store.ErrNoAlertmanagerConfiguration) { return err } + for { select { case <-ctx.Done(): @@ -85,13 +104,21 @@ func (am *Alertmanager) Run(ctx context.Context) error { return nil case <-time.After(1 * time.Minute): // TODO: once we have a check to skip reload on same config, uncomment this. - //if err := am.ReloadConfigFromDatabase(); err != nil { - // am.logger.Error("failed to sync config from database", "error", err) + //if err := am.SyncAndApplyConfigFromDatabase(); err != nil { + // if err == store.ErrNoAlertmanagerConfiguration { + // am.logger.Warn(errors.Wrap(err, "unable to sync configuration").Error()) + // } + // am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error()) //} } } } +// AddMigration runs the database migrations as the service starts. +func (am *Alertmanager) AddMigration(mg *migrator.Migrator) { + alertmanagerConfigurationMigration(mg) +} + func (am *Alertmanager) StopAndWait() { if am.dispatcher != nil { am.dispatcher.Stop() @@ -99,43 +126,67 @@ func (am *Alertmanager) StopAndWait() { am.dispatcherWG.Wait() } -// ReloadConfigFromDatabase picks the latest config from database and restarts +// SyncAndApplyConfigFromDatabase picks the latest config from database and restarts // the components with the new config. -func (am *Alertmanager) ReloadConfigFromDatabase() error { +func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() // TODO: check if config is same as before using hashes and skip reload in case they are same. 
- cfg, err := getConfigFromDatabase() + cfg, err := am.getConfigFromDatabase() if err != nil { return errors.Wrap(err, "get config from database") } return errors.Wrap(am.ApplyConfig(cfg), "reload from config") } -func getConfigFromDatabase() (*api.PostableApiAlertingConfig, error) { - // TODO: get configs from the database. - return &api.PostableApiAlertingConfig{}, nil +func (am *Alertmanager) getConfigFromDatabase() (*api.PostableUserConfig, error) { + // First, let's get the configuration we need from the database. + q := &models.GetLatestAlertmanagerConfigurationQuery{} + if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { + return nil, err + } + + // Then, let's parse and return the alertmanager configuration. + return Load(q.Result.AlertmanagerConfiguration) } // ApplyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. -func (am *Alertmanager) ApplyConfig(cfg *api.PostableApiAlertingConfig) error { +func (am *Alertmanager) ApplyConfig(cfg *api.PostableUserConfig) error { + // First, we need to make sure we persist the templates to disk. + paths, _, err := PersistTemplates(cfg, am.WorkingDirPath()) + if err != nil { + return err + } + + // With the templates persisted, create the template list using the paths. + tmpl, err := template.FromGlobs(paths...) + if err != nil { + return err + } + + // Finally, build the integrations map using the receiver configuration and templates. 
+ integrationsMap, err := am.buildIntegrationsMap(cfg.AlertmanagerConfig.Receivers, tmpl) + if err != nil { + return err + } // Now, let's put together our notification pipeline - receivers := buildIntegrationsMap() - routingStage := make(notify.RoutingStage, len(receivers)) + routingStage := make(notify.RoutingStage, len(integrationsMap)) silencingStage := notify.NewMuteStage(silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger())) - //TODO: We need to unify these receivers - for name := range receivers { - stage := createReceiverStage(name, receivers[name], waitFunc, am.notificationLog) + for name := range integrationsMap { + stage := createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) routingStage[name] = notify.MultiStage{silencingStage, stage} } am.alerts.SetStage(routingStage) am.StopAndWait() - am.dispatcher = dispatch.NewDispatcher(am.alerts, BuildRoutingConfiguration(), routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) + //TODO: Verify this is correct + route := dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) + //TODO: This needs the metrics + am.dispatcher = dispatch.NewDispatcher(am.alerts, route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) am.dispatcherWG.Add(1) go func() { @@ -146,6 +197,43 @@ func (am *Alertmanager) ApplyConfig(cfg *api.PostableApiAlertingConfig) error { return nil } +func (am *Alertmanager) WorkingDirPath() string { + return filepath.Join(am.Settings.DataPath, workingDir) +} + +// buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config. 
+func (am *Alertmanager) buildIntegrationsMap(receivers []*api.PostableApiReceiver, templates *template.Template) (map[string][]notify.Integration, error) { + integrationsMap := make(map[string][]notify.Integration, len(receivers)) + for _, receiver := range receivers { + integrations, err := am.buildReceiverIntegrations(receiver, templates) + if err != nil { + return nil, err + } + integrationsMap[receiver.Name] = integrations + } + + return integrationsMap, nil +} + +// buildReceiverIntegrations builds a list of integration notifiers off of a receiver config. +func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiReceiver, _ *template.Template) ([]notify.Integration, error) { + var integrations []notify.Integration + + for i, r := range receiver.GrafanaManagedReceivers { + switch r.Type { + case "email": + n, err := channels.NewEmailNotifier(r.Result) + if err != nil { + return nil, err + } + + integrations = append(integrations, notify.NewIntegration(n, n, r.Name, i)) + } + } + + return integrations, nil +} + // CreateAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not func (am *Alertmanager) CreateAlerts(alerts ...*PostableAlert) error { return am.alerts.PutPostableAlert(alerts...) @@ -232,16 +320,6 @@ func createReceiverStage(name string, integrations []notify.Integration, wait fu return fs } -// BuildRoutingConfiguration produces an alertmanager-based routing configuration. 
-func BuildRoutingConfiguration() *dispatch.Route { - var cfg *config.Config - return dispatch.NewRoute(cfg.Route, nil) -} - -func buildIntegrationsMap() map[string][]notify.Integration { - return map[string][]notify.Integration{} -} - func waitFunc() time.Duration { return setting.AlertingNotificationTimeout } From 55c76f869cb41bb3dd0dc9b9d52536fb383bec24 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 25 Mar 2021 17:21:44 +0530 Subject: [PATCH 05/85] Upgrade Prometheus Alertmanager and small fixes (#32280) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 476b91bf..2dfbec51 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -8,6 +8,7 @@ import ( "time" gokit_log "github.com/go-kit/kit/log" + "github.com/grafana/alerting-api/pkg/api" "github.com/pkg/errors" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/nflog" @@ -20,7 +21,6 @@ import ( "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" - "github.com/grafana/alerting-api/pkg/api" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/registry" "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -51,6 +51,8 @@ type Alertmanager struct { dispatcher *dispatch.Dispatcher dispatcherWG sync.WaitGroup + stageMetrics *notify.Metrics + reloadConfigMtx sync.Mutex } @@ -66,6 +68,7 @@ func (am *Alertmanager) Init() (err error) { am.logger = log.New("alertmanager") r := prometheus.NewRegistry() am.marker = types.NewMarker(r) + am.stageMetrics = notify.NewMetrics(r) am.Store = store.DBstore{SQLStore: am.SQLStore} am.notificationLog, err = nflog.New( @@ -137,7 +140,7 @@ func (am *Alertmanager) 
SyncAndApplyConfigFromDatabase() error { if err != nil { return errors.Wrap(err, "get config from database") } - return errors.Wrap(am.ApplyConfig(cfg), "reload from config") + return errors.Wrap(am.applyConfig(cfg), "reload from config") } func (am *Alertmanager) getConfigFromDatabase() (*api.PostableUserConfig, error) { @@ -152,8 +155,16 @@ func (am *Alertmanager) getConfigFromDatabase() (*api.PostableUserConfig, error) } // ApplyConfig applies a new configuration by re-initializing all components using the configuration provided. -// It is not safe to call concurrently. func (am *Alertmanager) ApplyConfig(cfg *api.PostableUserConfig) error { + am.reloadConfigMtx.Lock() + defer am.reloadConfigMtx.Unlock() + + return am.applyConfig(cfg) +} + +// applyConfig applies a new configuration by re-initializing all components using the configuration provided. +// It is not safe to call concurrently. +func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { // First, we need to make sure we persist the templates to disk. 
paths, _, err := PersistTemplates(cfg, am.WorkingDirPath()) if err != nil { @@ -176,7 +187,7 @@ func (am *Alertmanager) ApplyConfig(cfg *api.PostableUserConfig) error { silencingStage := notify.NewMuteStage(silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger())) for name := range integrationsMap { - stage := createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) + stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) routingStage[name] = notify.MultiStage{silencingStage, stage} } @@ -279,7 +290,7 @@ func (am *Alertmanager) ListSilences(matchers []*labels.Matcher) ([]types.Silenc return active[i].EndsAt.Before(active[j].EndsAt) }) sort.Slice(pending, func(i int, j int) bool { - return pending[i].StartsAt.Before(pending[j].EndsAt) + return pending[i].EndsAt.Before(pending[j].EndsAt) }) sort.Slice(expired, func(i int, j int) bool { return expired[i].EndsAt.After(expired[j].EndsAt) @@ -300,7 +311,7 @@ func (am *Alertmanager) CreateSilence(silence *types.Silence) {} func (am *Alertmanager) DeleteSilence(silence *types.Silence) {} // createReceiverStage creates a pipeline of stages for a receiver. 
-func createReceiverStage(name string, integrations []notify.Integration, wait func() time.Duration, notificationLog notify.NotificationLog) notify.Stage { +func (am *Alertmanager) createReceiverStage(name string, integrations []notify.Integration, wait func() time.Duration, notificationLog notify.NotificationLog) notify.Stage { var fs notify.FanoutStage for i := range integrations { recv := &nflogpb.Receiver{ @@ -312,7 +323,7 @@ func createReceiverStage(name string, integrations []notify.Integration, wait fu s = append(s, notify.NewWaitStage(wait)) s = append(s, notify.NewDedupStage(&integrations[i], notificationLog, recv)) //TODO: This probably won't work w/o the metrics - s = append(s, notify.NewRetryStage(integrations[i], name, nil)) + s = append(s, notify.NewRetryStage(integrations[i], name, am.stageMetrics)) s = append(s, notify.NewSetNotifiesStage(notificationLog, recv)) fs = append(fs, s) From 7b4cefa6b9f594d9227510d453f44c6d0fbab5e5 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Mon, 29 Mar 2021 20:35:15 +0530 Subject: [PATCH 06/85] AlertingNG: Fix dispatcher metrics in notifier (#32434) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 2dfbec51..099af3d9 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -51,7 +51,8 @@ type Alertmanager struct { dispatcher *dispatch.Dispatcher dispatcherWG sync.WaitGroup - stageMetrics *notify.Metrics + stageMetrics *notify.Metrics + dispatcherMetrics *dispatch.DispatcherMetrics reloadConfigMtx sync.Mutex } @@ -69,6 +70,7 @@ func (am *Alertmanager) Init() (err error) { r := prometheus.NewRegistry() am.marker = types.NewMarker(r) am.stageMetrics = notify.NewMetrics(r) + am.dispatcherMetrics = 
dispatch.NewDispatcherMetrics(r) am.Store = store.DBstore{SQLStore: am.SQLStore} am.notificationLog, err = nflog.New( @@ -196,8 +198,7 @@ func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { am.StopAndWait() //TODO: Verify this is correct route := dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) - //TODO: This needs the metrics - am.dispatcher = dispatch.NewDispatcher(am.alerts, route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), nil) + am.dispatcher = dispatch.NewDispatcher(am.alerts, route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) am.dispatcherWG.Add(1) go func() { From fc6f25bd6dcec142fcdac87c9e68f3b0dc0098ca Mon Sep 17 00:00:00 2001 From: David Parrott Date: Tue, 30 Mar 2021 09:37:56 -0700 Subject: [PATCH 07/85] Alerting: Send alerts from state tracker to notifier, logging, and cleanup task (#32333) * Initial commit for state tracking * basic state transition logic and tests * constructor. test and interface fixup * use new sig for sch.definitionRoutine() * test fixup * make the linter happy * more minor linting cleanup * Alerting: Send alerts from state tracker to notifier * Add evaluation time and test Add evaluation time and test * Add cleanup routine and logging * Pull in compact.go and reconcile differences * pr feedback * pr feedback Pull in compact.go and reconcile differences Co-authored-by: Josue Abreu --- pkg/services/ngalert/notifier/alertmanager.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 099af3d9..a9ab2cf4 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -246,8 +246,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiRecei return integrations, nil } -// CreateAlerts receives the alerts and then sends them through the corresponding route based 
on whenever the alert has a receiver embedded or not -func (am *Alertmanager) CreateAlerts(alerts ...*PostableAlert) error { +// PutAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not +func (am *Alertmanager) PutAlerts(alerts ...*PostableAlert) error { return am.alerts.PutPostableAlert(alerts...) } From 87cd3cb0f4188cf00aae8390a283e970c2fd8f58 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 31 Mar 2021 12:36:36 +0100 Subject: [PATCH 08/85] Alerting: Introduce the silencing interface (#32517) * Alerting: Introduce the silencing interface The operations introduced are: - Listing silences - Retrieving an specific silence - Deleting a silence - Creating a silence Signed-off-by: Josue Abreu * Add a comment to listing silences * Update to upstream alertmanager * Remove copied code from the Alertmanager --- pkg/services/ngalert/notifier/alertmanager.go | 133 +----------------- 1 file changed, 6 insertions(+), 127 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index a9ab2cf4..43df284a 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -3,7 +3,6 @@ package notifier import ( "context" "path/filepath" - "sort" "sync" "time" @@ -14,9 +13,7 @@ import ( "github.com/prometheus/alertmanager/nflog" "github.com/prometheus/alertmanager/nflog/nflogpb" "github.com/prometheus/alertmanager/notify" - "github.com/prometheus/alertmanager/pkg/labels" "github.com/prometheus/alertmanager/silence" - "github.com/prometheus/alertmanager/silence/silencepb" "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" @@ -33,6 +30,8 @@ import ( const ( workingDir = "alerting" + // How long should we keep silences and notification entries on-disk after they've served their purpose. 
+ retentionNotificationsAndSilences = 5 * 24 * time.Hour ) type Alertmanager struct { @@ -74,15 +73,15 @@ func (am *Alertmanager) Init() (err error) { am.Store = store.DBstore{SQLStore: am.SQLStore} am.notificationLog, err = nflog.New( - nflog.WithRetention(time.Hour*24), //TODO: This is a setting. - nflog.WithSnapshot(filepath.Join("dir", "notifications")), //TODO: This should be a setting + nflog.WithRetention(retentionNotificationsAndSilences), + nflog.WithSnapshot(filepath.Join(am.WorkingDirPath(), "notifications")), ) if err != nil { return errors.Wrap(err, "unable to initialize the notification log component of alerting") } am.silences, err = silence.New(silence.Options{ - SnapshotFile: filepath.Join("dir", "silences"), //TODO: This is a setting - Retention: time.Hour * 24, //TODO: This is also a setting + SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"), + Retention: retentionNotificationsAndSilences, }) if err != nil { return errors.Wrap(err, "unable to initialize the silencing component of alerting") @@ -251,66 +250,6 @@ func (am *Alertmanager) PutAlerts(alerts ...*PostableAlert) error { return am.alerts.PutPostableAlert(alerts...) 
} -func (am *Alertmanager) ListSilences(matchers []*labels.Matcher) ([]types.Silence, error) { - pbsilences, _, err := am.silences.Query() - if err != nil { - return nil, errors.Wrap(err, "unable to query for the list of silences") - } - r := []types.Silence{} - for _, pbs := range pbsilences { - s, err := silenceFromProto(pbs) - if err != nil { - return nil, errors.Wrap(err, "unable to marshal silence") - } - - sms := make(map[string]string) - for _, m := range s.Matchers { - sms[m.Name] = m.Value - } - - if !matchFilterLabels(matchers, sms) { - continue - } - - r = append(r, *s) - } - - var active, pending, expired []types.Silence - for _, s := range r { - switch s.Status.State { - case types.SilenceStateActive: - active = append(active, s) - case types.SilenceStatePending: - pending = append(pending, s) - case types.SilenceStateExpired: - expired = append(expired, s) - } - } - - sort.Slice(active, func(i int, j int) bool { - return active[i].EndsAt.Before(active[j].EndsAt) - }) - sort.Slice(pending, func(i int, j int) bool { - return pending[i].EndsAt.Before(pending[j].EndsAt) - }) - sort.Slice(expired, func(i int, j int) bool { - return expired[i].EndsAt.After(expired[j].EndsAt) - }) - - // Initialize silences explicitly to an empty list (instead of nil) - // So that it does not get converted to "null" in JSON. - silences := []types.Silence{} - silences = append(silences, active...) - silences = append(silences, pending...) - silences = append(silences, expired...) - - return silences, nil -} - -func (am *Alertmanager) GetSilence(silence *types.Silence) {} -func (am *Alertmanager) CreateSilence(silence *types.Silence) {} -func (am *Alertmanager) DeleteSilence(silence *types.Silence) {} - // createReceiverStage creates a pipeline of stages for a receiver. 
func (am *Alertmanager) createReceiverStage(name string, integrations []notify.Integration, wait func() time.Duration, notificationLog notify.NotificationLog) notify.Stage { var fs notify.FanoutStage @@ -343,63 +282,3 @@ func timeoutFunc(d time.Duration) time.Duration { } return d + waitFunc() } - -// copied from the Alertmanager -func silenceFromProto(s *silencepb.Silence) (*types.Silence, error) { - sil := &types.Silence{ - ID: s.Id, - StartsAt: s.StartsAt, - EndsAt: s.EndsAt, - UpdatedAt: s.UpdatedAt, - Status: types.SilenceStatus{ - State: types.CalcSilenceState(s.StartsAt, s.EndsAt), - }, - Comment: s.Comment, - CreatedBy: s.CreatedBy, - } - for _, m := range s.Matchers { - var t labels.MatchType - switch m.Type { - case silencepb.Matcher_EQUAL: - t = labels.MatchEqual - case silencepb.Matcher_REGEXP: - t = labels.MatchRegexp - case silencepb.Matcher_NOT_EQUAL: - t = labels.MatchNotEqual - case silencepb.Matcher_NOT_REGEXP: - t = labels.MatchNotRegexp - } - matcher, err := labels.NewMatcher(t, m.Name, m.Pattern) - if err != nil { - return nil, err - } - - sil.Matchers = append(sil.Matchers, matcher) - } - - return sil, nil -} - -func matchFilterLabels(matchers []*labels.Matcher, sms map[string]string) bool { - for _, m := range matchers { - v, prs := sms[m.Name] - switch m.Type { - case labels.MatchNotRegexp, labels.MatchNotEqual: - if m.Value == "" && prs { - continue - } - if !m.Matches(v) { - return false - } - default: - if m.Value == "" && !prs { - continue - } - if !m.Matches(v) { - return false - } - } - } - - return true -} From 97d74c2cdafb2ada4b1c45a90a9cab2926e28611 Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Wed, 31 Mar 2021 23:00:56 +0300 Subject: [PATCH 09/85] [Alerting]: Alertmanager API implementation (#32174) * Add validation for grafana recipient * Alertmanager API implementation (WIP) * Fix encoding/decoding receiver settings from/to YAML * Save templates together with the configuration * update POST to apply latest config * 
Alertmanager service enabled by the ngalert toggle * Silence API integration with Alertmanager * Apply suggestions from code review Co-authored-by: gotjosh Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> --- pkg/services/ngalert/notifier/alertmanager.go | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 43df284a..3a7cea81 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -2,10 +2,15 @@ package notifier import ( "context" + "fmt" "path/filepath" "sync" "time" + "github.com/grafana/grafana/pkg/components/securejsondata" + + "github.com/grafana/grafana/pkg/models" + gokit_log "github.com/go-kit/kit/log" "github.com/grafana/alerting-api/pkg/api" "github.com/pkg/errors" @@ -20,7 +25,7 @@ import ( "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/registry" - "github.com/grafana/grafana/pkg/services/ngalert/models" + ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" "github.com/grafana/grafana/pkg/services/sqlstore" @@ -61,7 +66,10 @@ func init() { } func (am *Alertmanager) IsDisabled() bool { - return !setting.AlertingEnabled || !setting.ExecuteAlerts + if am.Settings == nil { + return true + } + return !am.Settings.IsNgAlertEnabled() } func (am *Alertmanager) Init() (err error) { @@ -146,7 +154,7 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { func (am *Alertmanager) getConfigFromDatabase() (*api.PostableUserConfig, error) { // First, let's get the configuration we need from the database. 
- q := &models.GetLatestAlertmanagerConfigurationQuery{} + q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{} if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { return nil, err } @@ -233,7 +241,22 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiRecei for i, r := range receiver.GrafanaManagedReceivers { switch r.Type { case "email": - n, err := channels.NewEmailNotifier(r.Result) + frequency, err := time.ParseDuration(r.Frequency) + if err != nil { + return nil, fmt.Errorf("unable to parse receiver frequency %s, %w", r.Frequency, err) + } + notification := models.AlertNotification{ + Uid: r.Uid, + Name: r.Name, + Type: r.Type, + IsDefault: r.IsDefault, + SendReminder: r.SendReminder, + DisableResolveMessage: r.DisableResolveMessage, + Frequency: frequency, + Settings: r.Settings, + SecureSettings: securejsondata.GetEncryptedJsonData(r.SecureSettings), + } + n, err := channels.NewEmailNotifier(¬ification) if err != nil { return nil, err } From 90f541533a4735f2a29662a8aae6406ad35fc7e8 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 8 Apr 2021 12:27:59 +0100 Subject: [PATCH 10/85] Alerting: Allow querying of Alerts from notifications (#32614) * Alerting: Allow querying of Alerts from notifications * Wire everything up * Remove unused functions * Remove duplicate line --- pkg/services/ngalert/notifier/alertmanager.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 3a7cea81..95161cb4 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -48,17 +48,18 @@ type Alertmanager struct { // notificationLog keeps tracks of which notifications we've fired already. notificationLog *nflog.Log // silences keeps the track of which notifications we should not fire due to user configuration. 
- silences *silence.Silences - marker types.Marker - alerts *AlertProvider - + silencer *silence.Silencer + silences *silence.Silences + marker types.Marker + alerts *AlertProvider + route *dispatch.Route dispatcher *dispatch.Dispatcher dispatcherWG sync.WaitGroup stageMetrics *notify.Metrics dispatcherMetrics *dispatch.DispatcherMetrics - reloadConfigMtx sync.Mutex + reloadConfigMtx sync.RWMutex } func init() { @@ -194,7 +195,8 @@ func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { // Now, let's put together our notification pipeline routingStage := make(notify.RoutingStage, len(integrationsMap)) - silencingStage := notify.NewMuteStage(silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger())) + am.silencer = silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger()) + silencingStage := notify.NewMuteStage(am.silencer) for name := range integrationsMap { stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) routingStage[name] = notify.MultiStage{silencingStage, stage} @@ -203,9 +205,8 @@ func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { am.alerts.SetStage(routingStage) am.StopAndWait() - //TODO: Verify this is correct - route := dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) + am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) + am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) am.dispatcherWG.Add(1) go func() { @@ -285,7 +286,6 @@ func (am *Alertmanager) createReceiverStage(name string, integrations []notify.I var s notify.MultiStage s = append(s, notify.NewWaitStage(wait)) s = append(s, notify.NewDedupStage(&integrations[i], notificationLog, recv)) - //TODO: This probably won't work w/o the metrics s = 
append(s, notify.NewRetryStage(integrations[i], name, am.stageMetrics)) s = append(s, notify.NewSetNotifiesStage(notificationLog, recv)) From 7e5c1bfbc9bf9f228a72ef1a20a8286ef5fbca5e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 8 Apr 2021 22:01:23 +0530 Subject: [PATCH 11/85] AlertingNG: Add a global registry for notification channels (#32781) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 95161cb4..458c7024 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -25,6 +25,7 @@ import ( "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/registry" + "github.com/grafana/grafana/pkg/services/alerting" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" @@ -305,3 +306,29 @@ func timeoutFunc(d time.Duration) time.Duration { } return d + waitFunc() } + +// GetAvailableNotifiers returns the metadata of all the notification channels that can be configured. 
+func (am *Alertmanager) GetAvailableNotifiers() []*alerting.NotifierPlugin { + return []*alerting.NotifierPlugin{ + { + Type: "email", + Name: "Email", + Description: "Sends notifications using Grafana server configured SMTP settings", + Heading: "Email settings", + Options: []alerting.NotifierOption{ + { + Label: "Single email", + Description: "Send a single email to all recipients", + Element: alerting.ElementTypeCheckbox, + PropertyName: "singleEmail", + }, { + Label: "Addresses", + Description: "You can enter multiple email addresses using a \";\" separator", + Element: alerting.ElementTypeTextArea, + PropertyName: "addresses", + Required: true, + }, + }, + }, + } +} From 11175fe6c3a8f84f7973a05c40b11cae466c3d60 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 8 Apr 2021 22:51:09 +0530 Subject: [PATCH 12/85] AlertingNG: PagerDuty notification channel (#32604) * AlertingNG: PagerDuty notification channel Signed-off-by: Ganesh Vernekar * Add tests Signed-off-by: Ganesh Vernekar * Fix lint Signed-off-by: Ganesh Vernekar * Fix reviews Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 458c7024..c0e8c200 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -2,15 +2,11 @@ package notifier import ( "context" - "fmt" + "net/url" "path/filepath" "sync" "time" - "github.com/grafana/grafana/pkg/components/securejsondata" - - "github.com/grafana/grafana/pkg/models" - gokit_log "github.com/go-kit/kit/log" "github.com/grafana/alerting-api/pkg/api" "github.com/pkg/errors" @@ -23,7 +19,9 @@ import ( "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" + "github.com/grafana/grafana/pkg/components/securejsondata" 
"github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/models" "github.com/grafana/grafana/pkg/registry" "github.com/grafana/grafana/pkg/services/alerting" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -173,6 +171,8 @@ func (am *Alertmanager) ApplyConfig(cfg *api.PostableUserConfig) error { return am.applyConfig(cfg) } +const defaultTemplate = "templates/default.tmpl" + // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { @@ -182,6 +182,8 @@ func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { return err } + paths = append([]string{defaultTemplate}, paths...) + // With the templates persisted, create the template list using the paths. tmpl, err := template.FromGlobs(paths...) if err != nil { @@ -236,35 +238,44 @@ func (am *Alertmanager) buildIntegrationsMap(receivers []*api.PostableApiReceive return integrationsMap, nil } +type NotificationChannel interface { + notify.Notifier + notify.ResolvedSender +} + // buildReceiverIntegrations builds a list of integration notifiers off of a receiver config. 
-func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiReceiver, _ *template.Template) ([]notify.Integration, error) { +func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) { var integrations []notify.Integration for i, r := range receiver.GrafanaManagedReceivers { - switch r.Type { - case "email": - frequency, err := time.ParseDuration(r.Frequency) - if err != nil { - return nil, fmt.Errorf("unable to parse receiver frequency %s, %w", r.Frequency, err) - } - notification := models.AlertNotification{ + var ( + cfg = &models.AlertNotification{ Uid: r.Uid, Name: r.Name, Type: r.Type, IsDefault: r.IsDefault, SendReminder: r.SendReminder, DisableResolveMessage: r.DisableResolveMessage, - Frequency: frequency, Settings: r.Settings, SecureSettings: securejsondata.GetEncryptedJsonData(r.SecureSettings), } - n, err := channels.NewEmailNotifier(¬ification) - if err != nil { - return nil, err - } - - integrations = append(integrations, notify.NewIntegration(n, n, r.Name, i)) + n NotificationChannel + err error + ) + externalURL, err := url.Parse(am.Settings.AppURL) + if err != nil { + return nil, err + } + switch r.Type { + case "email": + n, err = channels.NewEmailNotifier(cfg, externalURL) + case "pagerduty": + n, err = channels.NewPagerdutyNotifier(cfg, tmpl, externalURL) + } + if err != nil { + return nil, err } + integrations = append(integrations, notify.NewIntegration(n, n, r.Name, i)) } return integrations, nil From e4b2f4f425b8ba90bd36f623317a3825fadadd6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torkel=20=C3=96degaard?= Date: Thu, 8 Apr 2021 19:49:43 +0200 Subject: [PATCH 13/85] Packages: Tyring to fix missing css prop issue caused by emotion (#32806) From 14866d1103102e35da939c217118c9508342ed66 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 13 Apr 2021 13:02:44 +0100 Subject: [PATCH 14/85] Alerting: Use a default configuration and periodically poll for 
new ones (#32851) * Alerting: Use a default configuration and periodically poll for new ones Use a default configuration to make sure we always start the grafana instance. Then, regularly poll for new ones. I've also made sure that failures to apply configuration do not stop the Grafana server but instead keep polling until it is a success. --- pkg/services/ngalert/notifier/alertmanager.go | 123 +++++++++++++----- 1 file changed, 93 insertions(+), 30 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index c0e8c200..7ced46b1 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -2,13 +2,16 @@ package notifier import ( "context" + "crypto/md5" + "encoding/json" + "fmt" "net/url" "path/filepath" "sync" "time" gokit_log "github.com/go-kit/kit/log" - "github.com/grafana/alerting-api/pkg/api" + apimodels "github.com/grafana/alerting-api/pkg/api" "github.com/pkg/errors" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/nflog" @@ -33,9 +36,33 @@ import ( ) const ( - workingDir = "alerting" + pollInterval = 1 * time.Minute + workingDir = "alerting" // How long should we keep silences and notification entries on-disk after they've served their purpose. retentionNotificationsAndSilences = 5 * 24 * time.Hour + // To start, the alertmanager needs at least one route defined. + // TODO: we should move this to Grafana settings and define this as the default. 
+ alertmanagerDefaultConfiguration = ` +{ + "alertmanager_config": { + "route": { + "receiver": "grafana-default-email" + }, + "receivers": [{ + "name": "grafana-default-email", + "grafana_managed_receiver_configs": [{ + "uid": "", + "name": "email receiver", + "type": "email", + "isDefault": true, + "settings": { + "addresses": "" + } + }] + }] + } +} +` ) type Alertmanager struct { @@ -59,6 +86,7 @@ type Alertmanager struct { dispatcherMetrics *dispatch.DispatcherMetrics reloadConfigMtx sync.RWMutex + config []byte } func init() { @@ -105,8 +133,8 @@ func (am *Alertmanager) Init() (err error) { func (am *Alertmanager) Run(ctx context.Context) error { // Make sure dispatcher starts. We can tolerate future reload failures. - if err := am.SyncAndApplyConfigFromDatabase(); err != nil && !errors.Is(err, store.ErrNoAlertmanagerConfiguration) { - return err + if err := am.SyncAndApplyConfigFromDatabase(); err != nil { + am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error()) } for { @@ -114,14 +142,10 @@ func (am *Alertmanager) Run(ctx context.Context) error { case <-ctx.Done(): am.StopAndWait() return nil - case <-time.After(1 * time.Minute): - // TODO: once we have a check to skip reload on same config, uncomment this. - //if err := am.SyncAndApplyConfigFromDatabase(); err != nil { - // if err == store.ErrNoAlertmanagerConfiguration { - // am.logger.Warn(errors.Wrap(err, "unable to sync configuration").Error()) - // } - // am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error()) - //} + case <-time.After(pollInterval): + if err := am.SyncAndApplyConfigFromDatabase(); err != nil { + am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error()) + } } } } @@ -138,33 +162,54 @@ func (am *Alertmanager) StopAndWait() { am.dispatcherWG.Wait() } -// SyncAndApplyConfigFromDatabase picks the latest config from database and restarts -// the components with the new config. 
-func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { +func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { + rawConfig, err := json.Marshal(&cfg) + if err != nil { + return errors.Wrap(err, "failed to serialize to the Alertmanager configuration") + } + am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() - // TODO: check if config is same as before using hashes and skip reload in case they are same. - cfg, err := am.getConfigFromDatabase() - if err != nil { - return errors.Wrap(err, "get config from database") + cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ + AlertmanagerConfiguration: string(rawConfig), + ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), + } + + if err := am.Store.SaveAlertmanagerConfiguration(cmd); err != nil { + return errors.Wrap(err, "failed to save Alertmanager configuration") } - return errors.Wrap(am.applyConfig(cfg), "reload from config") + + return errors.Wrap(am.applyConfig(cfg), "unable to reload configuration") } -func (am *Alertmanager) getConfigFromDatabase() (*api.PostableUserConfig, error) { +// SyncAndApplyConfigFromDatabase picks the latest config from database and restarts +// the components with the new config. +func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { + am.reloadConfigMtx.Lock() + defer am.reloadConfigMtx.Unlock() + // First, let's get the configuration we need from the database. q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{} if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { - return nil, err + // If there's no configuration in the database, let's use the default configuration. 
+ if errors.Is(err, store.ErrNoAlertmanagerConfiguration) { + q.Result = &ngmodels.AlertConfiguration{AlertmanagerConfiguration: alertmanagerDefaultConfiguration} + } else { + return errors.Wrap(err, "unable to get Alertmanager configuration from the database") + } + } + + cfg, err := Load([]byte(q.Result.AlertmanagerConfiguration)) + if err != nil { + return err } - // Then, let's parse and return the alertmanager configuration. - return Load(q.Result.AlertmanagerConfiguration) + return errors.Wrap(am.applyConfig(cfg), "unable to reload configuration") } // ApplyConfig applies a new configuration by re-initializing all components using the configuration provided. -func (am *Alertmanager) ApplyConfig(cfg *api.PostableUserConfig) error { +func (am *Alertmanager) ApplyConfig(cfg *apimodels.PostableUserConfig) error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() @@ -175,13 +220,30 @@ const defaultTemplate = "templates/default.tmpl" // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. -func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { - // First, we need to make sure we persist the templates to disk. - paths, _, err := PersistTemplates(cfg, am.WorkingDirPath()) +func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { + // First, let's make sure this config is not already loaded + var configChanged bool + rawConfig, err := json.Marshal(cfg.AlertmanagerConfig) + if err != nil { + // In theory, this should never happen. + return err + } + + if md5.Sum(am.config) != md5.Sum(rawConfig) { + configChanged = true + } + // next, we need to make sure we persist the templates to disk. + paths, templatesChanged, err := PersistTemplates(cfg, am.WorkingDirPath()) if err != nil { return err } + // If neither the configuration nor templates have changed, we've got nothing to do. 
+ if !configChanged && !templatesChanged { + am.logger.Debug("neither config nor template have changed, skipping configuration sync.") + return nil + } + paths = append([]string{defaultTemplate}, paths...) // With the templates persisted, create the template list using the paths. @@ -217,6 +279,7 @@ func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error { am.dispatcher.Run() }() + am.config = rawConfig return nil } @@ -225,7 +288,7 @@ func (am *Alertmanager) WorkingDirPath() string { } // buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config. -func (am *Alertmanager) buildIntegrationsMap(receivers []*api.PostableApiReceiver, templates *template.Template) (map[string][]notify.Integration, error) { +func (am *Alertmanager) buildIntegrationsMap(receivers []*apimodels.PostableApiReceiver, templates *template.Template) (map[string][]notify.Integration, error) { integrationsMap := make(map[string][]notify.Integration, len(receivers)) for _, receiver := range receivers { integrations, err := am.buildReceiverIntegrations(receiver, templates) @@ -244,7 +307,7 @@ type NotificationChannel interface { } // buildReceiverIntegrations builds a list of integration notifiers off of a receiver config. 
-func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) { +func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) { var integrations []notify.Integration for i, r := range receiver.GrafanaManagedReceivers { From 4e51a666ff36c7c9acf94acc27623d50a359002f Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 15 Apr 2021 16:01:41 +0530 Subject: [PATCH 15/85] AlertingNG: Slack notification channel (#32675) * AlertingNG: Slack notification channel Signed-off-by: Ganesh Vernekar * Add tests Signed-off-by: Ganesh Vernekar * Fix review comments Signed-off-by: Ganesh Vernekar * Fix review comments and small refactoring Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 7ced46b1..c6d449a4 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -334,6 +334,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewEmailNotifier(cfg, externalURL) case "pagerduty": n, err = channels.NewPagerdutyNotifier(cfg, tmpl, externalURL) + case "slack": + n, err = channels.NewSlackNotifier(cfg, tmpl, externalURL) } if err != nil { return nil, err From 8e363cde66715c6721d495a5536e3263d4e6cb9f Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Mon, 19 Apr 2021 12:28:44 +0530 Subject: [PATCH 16/85] AlertingNG: Remove the receivers field from postable alerts (#33068) * AlertingNG: Remove the receivers field from postable alerts and update tests Signed-off-by: Ganesh Vernekar * Fix review comments Signed-off-by: Ganesh Vernekar --- 
pkg/services/ngalert/notifier/alertmanager.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index c6d449a4..ad75044a 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -123,7 +123,7 @@ func (am *Alertmanager) Init() (err error) { return errors.Wrap(err, "unable to initialize the silencing component of alerting") } - am.alerts, err = NewAlertProvider(nil, am.marker) + am.alerts, err = NewAlertProvider(am.marker) if err != nil { return errors.Wrap(err, "unable to initialize the alert provider component of alerting") } @@ -267,8 +267,6 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { routingStage[name] = notify.MultiStage{silencingStage, stage} } - am.alerts.SetStage(routingStage) - am.StopAndWait() am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) @@ -347,8 +345,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp } // PutAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not -func (am *Alertmanager) PutAlerts(alerts ...*PostableAlert) error { - return am.alerts.PutPostableAlert(alerts...) +func (am *Alertmanager) PutAlerts(alerts apimodels.PostableAlerts) error { + return am.alerts.PutPostableAlert(alerts) } // createReceiverStage creates a pipeline of stages for a receiver. 
From 05c15e3e43088f8a614b310e414d4558c21d26b2 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Mon, 19 Apr 2021 14:26:04 -0400 Subject: [PATCH 17/85] Inhouse alerting api (#33129) * init * autogens AM route * POST dashboards/db spec * POST alert-notifications spec * fix description * re inits vendor, updates grafana to master * go mod updates * alerting routes * renames to receivers * prometheus endpoints * align config endpoint with cortex, include templates * Change grafana receiver type * Update receivers.go * rename struct to stop swagger thrashing * add rules API * index html * standalone swagger ui html page * Update README.md * Expose GrafanaManagedAlert properties * Some fixes - /api/v1/rules/{Namespace} should return a map - update ExtendedUpsertAlertDefinitionCommand properties * am alerts routes * rename prom swagger section for clarity, remove example endpoints * Add missing json and yaml tags * folder perms * make folders POST again * fix grafana receiver type * rename folder->namespace for perms * make ruler json again * PR fixes * silences * fix Ok -> Ack * Add id to POST /api/v1/silences (#9) Signed-off-by: Ganesh Vernekar * Add POST /api/v1/alerts (#10) Signed-off-by: Ganesh Vernekar * fix silences * Add testing endpoints * removes grpc replace directives * [wip] starts validation * pkg cleanup * go mod tidy * ignores vendor dir * Change response type for Cortex/Loki alerts * receiver unmarshaling tests * ability to split routes between AM & Grafana * api marshaling & validation * begins work on routing lib * [hack] ignores embedded field in generation * path specific datasource for alerting * align endpoint names with cloud * single route per Alerting config * removes unused routing pkg * regens spec * adds datasource param to ruler/prom route paths * Modifications for supporting migration * Apply suggestions from code review * hack for cleaning circular refs in swagger definition * generates files * minor fixes for prom endpoints * decorate prom 
apis with required: true where applicable * Revert "generates files" This reverts commit ef7e97558477d79bcad416e043b04dbd04a2c8f7. * removes server autogen * Update imported structs from ngalert * Fix listing rules response * Update github.com/prometheus/common dependency * Update get silence response * Update get silences response * adds ruler validation & backend switching * Fix GET /alertmanager/{DatasourceId}/config/api/v1/alerts response * Distinct gettable and postable grafana receivers * Remove permissions routes * Latest JSON specs * Fix testing routes * inline yaml annotation on apirulenode * yaml test & yamlv3 + comments * Fix yaml annotations for embedded type * Rename DatasourceId path parameter * Implement Backend.String() * backend zero value is a real backend * exports DiscoveryBase * Fix GO initialisms * Silences: Use PostableSilence as the base struct for creating silences * Use type alias instead of struct embedding * More fixes to alertmanager silencing routes * post and spec JSONs * Split rule config to postable/gettable * Fix empty POST /silences payload Recreating the generated JSON specs fixes the issue without further modifications * better yaml unmarshaling for nested yaml docs in cortex-am configs * regens spec * re-adds config.receivers * omitempty to align with prometheus API behavior * Prefix routes with /api * Update Alertmanager models * Make adjustments to follow the Alertmanager API * ruler: add for and annotations to grafana alert (#45) * Modify testing API routes * Fix grafana rule for field type * Move PostableUserConfig validation to this library * Fix PostableUserConfig YAML encoding/decoding * Use common fields for grafana and lotex rules * Add namespace id in GettableGrafanaRule * Apply suggestions from code review * fixup * more changes * Apply suggestions from code review * aligns structure pre merge * fix new imports & tests * updates tooling readme * goimports * lint * more linting!! 
* revive lint Co-authored-by: Sofia Papagiannaki Co-authored-by: Domas Co-authored-by: Sofia Papagiannaki Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: gotjosh Co-authored-by: David Parrott Co-authored-by: Kyle Brandt --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index ad75044a..98a648de 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -11,7 +11,7 @@ import ( "time" gokit_log "github.com/go-kit/kit/log" - apimodels "github.com/grafana/alerting-api/pkg/api" + apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/pkg/errors" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/nflog" From 8481d9742f730888b76a2516e8bb7cc776708361 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 21 Apr 2021 11:34:42 +0100 Subject: [PATCH 18/85] Alerting: Various fixes for the alerts endpoint (#33182) A set of fixes for the GET alert and groups endpoints. - First, is the fact that the default values were not being used for the query params. I've introduced a new method in the Grafana context that allows us to do this. - Second, is the fact that alerts were never being transitioned to active. To my surprise this is actually done by the inhibitor in the pipeline - if an alert is not muted, or inhibited then it's active. - Third, I have added an integration test to cover for regressions. 
Signed-off-by: Josue Abreu --- pkg/services/ngalert/notifier/alertmanager.go | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 98a648de..985495ff 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -14,6 +14,7 @@ import ( apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/pkg/errors" "github.com/prometheus/alertmanager/dispatch" + "github.com/prometheus/alertmanager/inhibit" "github.com/prometheus/alertmanager/nflog" "github.com/prometheus/alertmanager/nflog/nflogpb" "github.com/prometheus/alertmanager/notify" @@ -74,13 +75,14 @@ type Alertmanager struct { // notificationLog keeps tracks of which notifications we've fired already. notificationLog *nflog.Log // silences keeps the track of which notifications we should not fire due to user configuration. 
- silencer *silence.Silencer - silences *silence.Silences - marker types.Marker - alerts *AlertProvider - route *dispatch.Route - dispatcher *dispatch.Dispatcher - dispatcherWG sync.WaitGroup + silencer *silence.Silencer + silences *silence.Silences + marker types.Marker + alerts *AlertProvider + route *dispatch.Route + dispatcher *dispatch.Dispatcher + inhibitor *inhibit.Inhibitor + wg sync.WaitGroup stageMetrics *notify.Metrics dispatcherMetrics *dispatch.DispatcherMetrics @@ -159,7 +161,11 @@ func (am *Alertmanager) StopAndWait() { if am.dispatcher != nil { am.dispatcher.Stop() } - am.dispatcherWG.Wait() + + if am.inhibitor != nil { + am.inhibitor.Stop() + } + am.wg.Wait() } func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { @@ -260,23 +266,32 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { // Now, let's put together our notification pipeline routingStage := make(notify.RoutingStage, len(integrationsMap)) + am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, gokit_log.NewNopLogger()) am.silencer = silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger()) + + inhibitionStage := notify.NewMuteStage(am.inhibitor) silencingStage := notify.NewMuteStage(am.silencer) for name := range integrationsMap { stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) - routingStage[name] = notify.MultiStage{silencingStage, stage} + routingStage[name] = notify.MultiStage{silencingStage, inhibitionStage, stage} } am.StopAndWait() am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) - am.dispatcherWG.Add(1) + am.wg.Add(1) go func() { - defer am.dispatcherWG.Done() + defer am.wg.Done() am.dispatcher.Run() }() + am.wg.Add(1) + go func() { + defer am.wg.Done() + 
am.inhibitor.Run() + }() + am.config = rawConfig return nil } From e0bd6d10f54355d57150bbc9d300a3a5c8911638 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Thu, 22 Apr 2021 11:18:25 +0200 Subject: [PATCH 19/85] NGAlert: Consolidate on standard errors package (#33249) * NGAlert: Don't use pkg/errors Signed-off-by: Arve Knudsen * Update pkg/services/ngalert/notifier/alertmanager.go Co-authored-by: Will Browne * Fix logging Signed-off-by: Arve Knudsen Co-authored-by: Will Browne --- pkg/services/ngalert/notifier/alertmanager.go | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 985495ff..b3165a80 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -10,9 +10,10 @@ import ( "sync" "time" + "errors" + gokit_log "github.com/go-kit/kit/log" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" - "github.com/pkg/errors" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/inhibit" "github.com/prometheus/alertmanager/nflog" @@ -115,19 +116,19 @@ func (am *Alertmanager) Init() (err error) { nflog.WithSnapshot(filepath.Join(am.WorkingDirPath(), "notifications")), ) if err != nil { - return errors.Wrap(err, "unable to initialize the notification log component of alerting") + return fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) } am.silences, err = silence.New(silence.Options{ SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"), Retention: retentionNotificationsAndSilences, }) if err != nil { - return errors.Wrap(err, "unable to initialize the silencing component of alerting") + return fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } am.alerts, err = NewAlertProvider(am.marker) if err != nil { - return errors.Wrap(err, "unable to initialize 
the alert provider component of alerting") + return fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) } return nil @@ -136,7 +137,7 @@ func (am *Alertmanager) Init() (err error) { func (am *Alertmanager) Run(ctx context.Context) error { // Make sure dispatcher starts. We can tolerate future reload failures. if err := am.SyncAndApplyConfigFromDatabase(); err != nil { - am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error()) + am.logger.Error("unable to sync configuration", "err", err) } for { @@ -146,7 +147,7 @@ func (am *Alertmanager) Run(ctx context.Context) error { return nil case <-time.After(pollInterval): if err := am.SyncAndApplyConfigFromDatabase(); err != nil { - am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error()) + am.logger.Error("unable to sync configuration", "err", err) } } } @@ -171,7 +172,7 @@ func (am *Alertmanager) StopAndWait() { func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { rawConfig, err := json.Marshal(&cfg) if err != nil { - return errors.Wrap(err, "failed to serialize to the Alertmanager configuration") + return fmt.Errorf("failed to serialize to the Alertmanager configuration: %w", err) } am.reloadConfigMtx.Lock() @@ -183,10 +184,13 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er } if err := am.Store.SaveAlertmanagerConfiguration(cmd); err != nil { - return errors.Wrap(err, "failed to save Alertmanager configuration") + return fmt.Errorf("failed to save Alertmanager configuration: %w", err) + } + if err := am.applyConfig(cfg); err != nil { + return fmt.Errorf("unable to reload configuration: %w", err) } - return errors.Wrap(am.applyConfig(cfg), "unable to reload configuration") + return nil } // SyncAndApplyConfigFromDatabase picks the latest config from database and restarts @@ -202,7 +206,7 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { if errors.Is(err, 
store.ErrNoAlertmanagerConfiguration) { q.Result = &ngmodels.AlertConfiguration{AlertmanagerConfiguration: alertmanagerDefaultConfiguration} } else { - return errors.Wrap(err, "unable to get Alertmanager configuration from the database") + return fmt.Errorf("unable to get Alertmanager configuration from the database: %w", err) } } @@ -211,7 +215,11 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { return err } - return errors.Wrap(am.applyConfig(cfg), "unable to reload configuration") + if err := am.applyConfig(cfg); err != nil { + return fmt.Errorf("unable to reload configuration: %w", err) + } + + return nil } // ApplyConfig applies a new configuration by re-initializing all components using the configuration provided. From da5ede584ec87afab6038b0913edacf5cf3ee067 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 22 Apr 2021 19:31:55 +0530 Subject: [PATCH 20/85] AlertingNG: Fix TODOs in email notification channel (#33169) * AlertingNG: Fix TODOs in email notification channel Signed-off-by: Ganesh Vernekar * Test fixup * Remove the receiver field it is not needed for the email notification Co-authored-by: Josue Abreu --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index b3165a80..d5374d7e 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -352,7 +352,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp } switch r.Type { case "email": - n, err = channels.NewEmailNotifier(cfg, externalURL) + n, err = channels.NewEmailNotifier(cfg, externalURL, am.Settings.AppURL) case "pagerduty": n, err = channels.NewPagerdutyNotifier(cfg, tmpl, externalURL) case "slack": From 17d0c98b2517b2c2c7dbf86c2d6425fdd65f14f3 Mon Sep 17 00:00:00 2001 From: Ganesh 
Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 22 Apr 2021 20:42:18 +0530 Subject: [PATCH 21/85] AlertingNG: Correctly set StartsAt, EndsAt, UpdatedAt after alert reception (#33109) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 100 +++++++++++++++++- 1 file changed, 95 insertions(+), 5 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index d5374d7e..f95602da 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -13,22 +13,26 @@ import ( "errors" gokit_log "github.com/go-kit/kit/log" - apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" + amv2 "github.com/prometheus/alertmanager/api/v2/models" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/inhibit" "github.com/prometheus/alertmanager/nflog" "github.com/prometheus/alertmanager/nflog/nflogpb" "github.com/prometheus/alertmanager/notify" + "github.com/prometheus/alertmanager/provider" + "github.com/prometheus/alertmanager/provider/mem" "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" "github.com/grafana/grafana/pkg/components/securejsondata" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/models" "github.com/grafana/grafana/pkg/registry" "github.com/grafana/grafana/pkg/services/alerting" + apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" @@ -42,6 +46,11 @@ const ( workingDir = "alerting" // How long should we keep silences and notification entries on-disk 
after they've served their purpose. retentionNotificationsAndSilences = 5 * 24 * time.Hour + // defaultResolveTimeout is the default timeout used for resolving an alert + // if the end time is not specified. + defaultResolveTimeout = 5 * time.Minute + // memoryAlertsGCInterval is the interval at which we'll remove resolved alerts from memory. + memoryAlertsGCInterval = 30 * time.Minute // To start, the alertmanager needs at least one route defined. // TODO: we should move this to Grafana settings and define this as the default. alertmanagerDefaultConfiguration = ` @@ -76,10 +85,11 @@ type Alertmanager struct { // notificationLog keeps tracks of which notifications we've fired already. notificationLog *nflog.Log // silences keeps the track of which notifications we should not fire due to user configuration. + silencer *silence.Silencer silences *silence.Silences marker types.Marker - alerts *AlertProvider + alerts provider.Alerts route *dispatch.Route dispatcher *dispatch.Dispatcher inhibitor *inhibit.Inhibitor @@ -126,7 +136,7 @@ func (am *Alertmanager) Init() (err error) { return fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } - am.alerts, err = NewAlertProvider(am.marker) + am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, gokit_log.NewNopLogger()) if err != nil { return fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) } @@ -368,8 +378,88 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp } // PutAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not -func (am *Alertmanager) PutAlerts(alerts apimodels.PostableAlerts) error { - return am.alerts.PutPostableAlert(alerts) +func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error { + now := time.Now() + alerts := make([]*types.Alert, 0, len(postableAlerts.PostableAlerts)) + 
var validationErr *AlertValidationError + for _, a := range postableAlerts.PostableAlerts { + alert := &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{}, + Annotations: model.LabelSet{}, + StartsAt: time.Time(a.StartsAt), + EndsAt: time.Time(a.EndsAt), + GeneratorURL: a.GeneratorURL.String(), + }, + UpdatedAt: now, + } + for k, v := range a.Labels { + if len(v) == 0 { // Skip empty labels. + continue + } + alert.Alert.Labels[model.LabelName(k)] = model.LabelValue(v) + } + for k, v := range a.Annotations { + if len(v) == 0 { // Skip empty annotation. + continue + } + alert.Alert.Annotations[model.LabelName(k)] = model.LabelValue(v) + } + + // Ensure StartsAt is set. + if alert.StartsAt.IsZero() { + if alert.EndsAt.IsZero() { + alert.StartsAt = now + } else { + alert.StartsAt = alert.EndsAt + } + } + // If no end time is defined, set a timeout after which an alert + // is marked resolved if it is not updated. + if alert.EndsAt.IsZero() { + alert.Timeout = true + alert.EndsAt = now.Add(defaultResolveTimeout) + } + + if err := alert.Validate(); err != nil { + if validationErr == nil { + validationErr = &AlertValidationError{} + } + validationErr.Alerts = append(validationErr.Alerts, a) + validationErr.Errors = append(validationErr.Errors, err) + continue + } + + alerts = append(alerts, alert) + } + + if err := am.alerts.Put(alerts...); err != nil { + // Notification sending alert takes precedence over validation errors. + return err + } + if validationErr != nil { + // Even if validationErr is nil, the require.NoError fails on it. + return validationErr + } + return nil +} + +// AlertValidationError is the error capturing the validation errors +// faced on the alerts. +type AlertValidationError struct { + Alerts []amv2.PostableAlert + Errors []error // Errors[i] refers to Alerts[i]. 
+} + +func (e AlertValidationError) Error() string { + errMsg := "" + if len(e.Errors) != 0 { + errMsg := e.Errors[0].Error() + for _, e := range e.Errors[1:] { + errMsg += ";" + e.Error() + } + } + return errMsg } // createReceiverStage creates a pipeline of stages for a receiver. From 7eea6e5a1a9fc40d3079a71dcd6f430d7eb655cb Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 22 Apr 2021 20:54:59 +0530 Subject: [PATCH 22/85] AlertingNG: Add Telegram notification channel (#32795) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f95602da..45c99f72 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -4,14 +4,13 @@ import ( "context" "crypto/md5" "encoding/json" + "errors" "fmt" "net/url" "path/filepath" "sync" "time" - "errors" - gokit_log "github.com/go-kit/kit/log" amv2 "github.com/prometheus/alertmanager/api/v2/models" "github.com/prometheus/alertmanager/dispatch" @@ -367,6 +366,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewPagerdutyNotifier(cfg, tmpl, externalURL) case "slack": n, err = channels.NewSlackNotifier(cfg, tmpl, externalURL) + case "telegram": + n, err = channels.NewTelegramNotifier(cfg, tmpl, externalURL) } if err != nil { return nil, err From 99956aebeb4f2c88a4c17bcde2cc556e6017bb7e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 22 Apr 2021 21:46:26 +0530 Subject: [PATCH 23/85] AlertingNG: Add Teams notification channel (#32979) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go 
b/pkg/services/ngalert/notifier/alertmanager.go index 45c99f72..2577d2c9 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -368,6 +368,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewSlackNotifier(cfg, tmpl, externalURL) case "telegram": n, err = channels.NewTelegramNotifier(cfg, tmpl, externalURL) + case "teams": + n, err = channels.NewTeamsNotifier(cfg, tmpl, externalURL) } if err != nil { return nil, err From eea8d84b366e55a74e89329f32ddd06452aea329 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 22 Apr 2021 23:00:49 +0530 Subject: [PATCH 24/85] AlertingNG: Add Dingding notification channel (#32995) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 2577d2c9..55e76356 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -370,6 +370,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewTelegramNotifier(cfg, tmpl, externalURL) case "teams": n, err = channels.NewTeamsNotifier(cfg, tmpl, externalURL) + case "dingding": + n, err = channels.NewDingDingNotifier(cfg, tmpl, externalURL) } if err != nil { return nil, err From ff18b863f472f00d6674f4644bf262bed6dc9dca Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Fri, 23 Apr 2021 18:59:28 +0530 Subject: [PATCH 25/85] AlertingNG: Add webhook notification channel (#33229) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go 
b/pkg/services/ngalert/notifier/alertmanager.go index 55e76356..84128ee4 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -274,6 +274,11 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { if err != nil { return err } + externalURL, err := url.Parse(am.Settings.AppURL) + if err != nil { + return err + } + tmpl.ExternalURL = externalURL // Finally, build the integrations map using the receiver configuration and templates. integrationsMap, err := am.buildIntegrationsMap(cfg.AlertmanagerConfig.Receivers, tmpl) @@ -355,23 +360,21 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n NotificationChannel err error ) - externalURL, err := url.Parse(am.Settings.AppURL) - if err != nil { - return nil, err - } switch r.Type { case "email": - n, err = channels.NewEmailNotifier(cfg, externalURL, am.Settings.AppURL) + n, err = channels.NewEmailNotifier(cfg, tmpl.ExternalURL) // Email notifier already has a default template. 
case "pagerduty": - n, err = channels.NewPagerdutyNotifier(cfg, tmpl, externalURL) + n, err = channels.NewPagerdutyNotifier(cfg, tmpl) case "slack": - n, err = channels.NewSlackNotifier(cfg, tmpl, externalURL) + n, err = channels.NewSlackNotifier(cfg, tmpl) case "telegram": - n, err = channels.NewTelegramNotifier(cfg, tmpl, externalURL) + n, err = channels.NewTelegramNotifier(cfg, tmpl) case "teams": - n, err = channels.NewTeamsNotifier(cfg, tmpl, externalURL) + n, err = channels.NewTeamsNotifier(cfg, tmpl) case "dingding": - n, err = channels.NewDingDingNotifier(cfg, tmpl, externalURL) + n, err = channels.NewDingDingNotifier(cfg, tmpl) + case "webhook": + n, err = channels.NewWebHookNotifier(cfg, tmpl) } if err != nil { return nil, err From 89546983e1a4521b35290e287fe13b6ecc419e86 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Fri, 23 Apr 2021 19:49:03 +0530 Subject: [PATCH 26/85] NGAlert: Run the maintenance cycle for the silences (#33301) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 53 ++++++++++++++----- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 84128ee4..44e62184 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -18,7 +18,6 @@ import ( "github.com/prometheus/alertmanager/nflog" "github.com/prometheus/alertmanager/nflog/nflogpb" "github.com/prometheus/alertmanager/notify" - "github.com/prometheus/alertmanager/provider" "github.com/prometheus/alertmanager/provider/mem" "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/template" @@ -45,6 +44,8 @@ const ( workingDir = "alerting" // How long should we keep silences and notification entries on-disk after they've served their purpose. 
retentionNotificationsAndSilences = 5 * 24 * time.Hour + // maintenanceNotificationAndSilences how often should we flush and gargabe collect notifications and silences + maintenanceNotificationAndSilences = 15 * time.Minute // defaultResolveTimeout is the default timeout used for resolving an alert // if the end time is not specified. defaultResolveTimeout = 5 * time.Minute @@ -81,18 +82,21 @@ type Alertmanager struct { SQLStore *sqlstore.SQLStore `inject:""` Store store.AlertingStore - // notificationLog keeps tracks of which notifications we've fired already. notificationLog *nflog.Log - // silences keeps the track of which notifications we should not fire due to user configuration. + marker types.Marker + alerts *mem.Alerts + route *dispatch.Route - silencer *silence.Silencer - silences *silence.Silences - marker types.Marker - alerts provider.Alerts - route *dispatch.Route dispatcher *dispatch.Dispatcher inhibitor *inhibit.Inhibitor - wg sync.WaitGroup + // wg is for dispatcher, inhibitor, silences and notifications + // Across configuration changes dispatcher and inhibitor are completely replaced, however, silences, notification log and alerts remain the same. + // stopc is used to let silences and notifications know we are done. 
+ wg sync.WaitGroup + stopc chan struct{} + + silencer *silence.Silencer + silences *silence.Silences stageMetrics *notify.Metrics dispatcherMetrics *dispatch.DispatcherMetrics @@ -113,6 +117,7 @@ func (am *Alertmanager) IsDisabled() bool { } func (am *Alertmanager) Init() (err error) { + am.stopc = make(chan struct{}) am.logger = log.New("alertmanager") r := prometheus.NewRegistry() am.marker = types.NewMarker(r) @@ -120,13 +125,17 @@ func (am *Alertmanager) Init() (err error) { am.dispatcherMetrics = dispatch.NewDispatcherMetrics(r) am.Store = store.DBstore{SQLStore: am.SQLStore} + // Initialize the notification log + am.wg.Add(1) am.notificationLog, err = nflog.New( nflog.WithRetention(retentionNotificationsAndSilences), nflog.WithSnapshot(filepath.Join(am.WorkingDirPath(), "notifications")), + nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done), ) if err != nil { return fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) } + // Initialize silences am.silences, err = silence.New(silence.Options{ SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"), Retention: retentionNotificationsAndSilences, @@ -135,6 +144,13 @@ func (am *Alertmanager) Init() (err error) { return fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } + am.wg.Add(1) + go func() { + am.silences.Maintenance(15*time.Minute, filepath.Join(am.WorkingDirPath(), "silences"), am.stopc) + am.wg.Done() + }() + + // Initialize in-memory alerts am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, gokit_log.NewNopLogger()) if err != nil { return fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) @@ -152,8 +168,7 @@ func (am *Alertmanager) Run(ctx context.Context) error { for { select { case <-ctx.Done(): - am.StopAndWait() - return nil + return am.StopAndWait() case <-time.After(pollInterval): if err := am.SyncAndApplyConfigFromDatabase(); err 
!= nil { am.logger.Error("unable to sync configuration", "err", err) @@ -167,7 +182,7 @@ func (am *Alertmanager) AddMigration(mg *migrator.Migrator) { alertmanagerConfigurationMigration(mg) } -func (am *Alertmanager) StopAndWait() { +func (am *Alertmanager) StopAndWait() error { if am.dispatcher != nil { am.dispatcher.Stop() } @@ -175,7 +190,13 @@ func (am *Alertmanager) StopAndWait() { if am.inhibitor != nil { am.inhibitor.Stop() } + + am.alerts.Close() + + close(am.stopc) + am.wg.Wait() + return nil } func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { @@ -288,6 +309,13 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { // Now, let's put together our notification pipeline routingStage := make(notify.RoutingStage, len(integrationsMap)) + if am.inhibitor != nil { + am.inhibitor.Stop() + } + if am.dispatcher != nil { + am.dispatcher.Stop() + } + am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, gokit_log.NewNopLogger()) am.silencer = silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger()) @@ -298,7 +326,6 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { routingStage[name] = notify.MultiStage{silencingStage, inhibitionStage, stage} } - am.StopAndWait() am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) From d2c88e562638956e606e35c48023b8965de2e4a2 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Fri, 30 Apr 2021 12:28:06 -0400 Subject: [PATCH 27/85] Alerting/metrics (#33547) * moves alerting metrics to their own pkg * adds grafana_alerting_alerts (by state) metric * alerts_received_{total,invalid} * embed alertmanager alerting struct in ng metrics & remove duplicated notification metrics (already embed alertmanager notifier metrics) * use silence metrics 
from alertmanager lib * fix - manager has metrics * updates ngalert tests * comment lint Signed-off-by: Owen Diehl * cleaner prom registry code * removes ngalert global metrics * new registry use in all tests * ngalert metrics impl service, hack testinfra code to prevent duplicate metric registrations * nilmetrics unexported --- pkg/services/ngalert/notifier/alertmanager.go | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 44e62184..03b6be2f 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -22,7 +22,6 @@ import ( "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" - "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/grafana/grafana/pkg/components/securejsondata" @@ -31,6 +30,7 @@ import ( "github.com/grafana/grafana/pkg/registry" "github.com/grafana/grafana/pkg/services/alerting" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" + "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" @@ -81,6 +81,7 @@ type Alertmanager struct { Settings *setting.Cfg `inject:""` SQLStore *sqlstore.SQLStore `inject:""` Store store.AlertingStore + Metrics *metrics.Metrics `inject:""` notificationLog *nflog.Log marker types.Marker @@ -116,13 +117,19 @@ func (am *Alertmanager) IsDisabled() bool { return !am.Settings.IsNgAlertEnabled() } -func (am *Alertmanager) Init() (err error) { +func (am *Alertmanager) Init() error { + return am.InitWithMetrics(am.Metrics) +} + +// InitWithMetrics uses the supplied metrics for instantiation and +// 
allows testware to circumvent duplicate registration errors. +func (am *Alertmanager) InitWithMetrics(m *metrics.Metrics) (err error) { am.stopc = make(chan struct{}) am.logger = log.New("alertmanager") - r := prometheus.NewRegistry() - am.marker = types.NewMarker(r) - am.stageMetrics = notify.NewMetrics(r) - am.dispatcherMetrics = dispatch.NewDispatcherMetrics(r) + am.marker = types.NewMarker(m.Registerer) + am.stageMetrics = notify.NewMetrics(m.Registerer) + am.dispatcherMetrics = dispatch.NewDispatcherMetrics(m.Registerer) + am.Metrics = m am.Store = store.DBstore{SQLStore: am.SQLStore} // Initialize the notification log @@ -137,6 +144,7 @@ func (am *Alertmanager) Init() (err error) { } // Initialize silences am.silences, err = silence.New(silence.Options{ + Metrics: m.Registerer, SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"), Retention: retentionNotificationsAndSilences, }) @@ -456,12 +464,19 @@ func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error alert.EndsAt = now.Add(defaultResolveTimeout) } + if alert.EndsAt.After(now) { + am.Metrics.Firing().Inc() + } else { + am.Metrics.Resolved().Inc() + } + if err := alert.Validate(); err != nil { if validationErr == nil { validationErr = &AlertValidationError{} } validationErr.Alerts = append(validationErr.Alerts, a) validationErr.Errors = append(validationErr.Errors, err) + am.Metrics.Invalid().Inc() continue } From ec8c836a4d6e8ab549abb22070643887ced48374 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Tue, 4 May 2021 17:28:39 +0530 Subject: [PATCH 28/85] NGAlert: Send list of available ngalert notification channels via API (#33489) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 03b6be2f..dd25bee7 100644 --- 
a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -28,7 +28,6 @@ import ( "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/models" "github.com/grafana/grafana/pkg/registry" - "github.com/grafana/grafana/pkg/services/alerting" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -543,29 +542,3 @@ func timeoutFunc(d time.Duration) time.Duration { } return d + waitFunc() } - -// GetAvailableNotifiers returns the metadata of all the notification channels that can be configured. -func (am *Alertmanager) GetAvailableNotifiers() []*alerting.NotifierPlugin { - return []*alerting.NotifierPlugin{ - { - Type: "email", - Name: "Email", - Description: "Sends notifications using Grafana server configured SMTP settings", - Heading: "Email settings", - Options: []alerting.NotifierOption{ - { - Label: "Single email", - Description: "Send a single email to all recipients", - Element: alerting.ElementTypeCheckbox, - PropertyName: "singleEmail", - }, { - Label: "Addresses", - Description: "You can enter multiple email addresses using a \";\" separator", - Element: alerting.ElementTypeTextArea, - PropertyName: "addresses", - Required: true, - }, - }, - }, - } -} From 2078cd0a64155add9da55aa5bbb3fd1946dcaebd Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Mon, 10 May 2021 15:30:42 +0300 Subject: [PATCH 29/85] [Alerting]: store encrypted receiver secure settings (#33832) * [Alerting]: Store secure settings encrypted * Move encryption to the API handler --- pkg/services/ngalert/notifier/alertmanager.go | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index dd25bee7..85d5bb22 100644 --- 
a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -3,6 +3,7 @@ package notifier import ( "context" "crypto/md5" + "encoding/base64" "encoding/json" "errors" "fmt" @@ -223,7 +224,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er if err := am.Store.SaveAlertmanagerConfiguration(cmd); err != nil { return fmt.Errorf("failed to save Alertmanager configuration: %w", err) } - if err := am.applyConfig(cfg); err != nil { + if err := am.applyConfig(cfg, rawConfig); err != nil { return fmt.Errorf("unable to reload configuration: %w", err) } @@ -252,32 +253,27 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { return err } - if err := am.applyConfig(cfg); err != nil { + if err := am.applyConfig(cfg, nil); err != nil { return fmt.Errorf("unable to reload configuration: %w", err) } return nil } -// ApplyConfig applies a new configuration by re-initializing all components using the configuration provided. -func (am *Alertmanager) ApplyConfig(cfg *apimodels.PostableUserConfig) error { - am.reloadConfigMtx.Lock() - defer am.reloadConfigMtx.Unlock() - - return am.applyConfig(cfg) -} - const defaultTemplate = "templates/default.tmpl" // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. -func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error { +func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) error { // First, let's make sure this config is not already loaded var configChanged bool - rawConfig, err := json.Marshal(cfg.AlertmanagerConfig) - if err != nil { - // In theory, this should never happen. - return err + if rawConfig == nil { + enc, err := json.Marshal(cfg.AlertmanagerConfig) + if err != nil { + // In theory, this should never happen. 
+ return err + } + rawConfig = enc } if md5.Sum(am.config) != md5.Sum(rawConfig) { @@ -380,6 +376,16 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp var integrations []notify.Integration for i, r := range receiver.GrafanaManagedReceivers { + // secure settings are already encrypted at this point + secureSettings := securejsondata.SecureJsonData(make(map[string][]byte, len(r.SecureSettings))) + + for k, v := range r.SecureSettings { + d, err := base64.StdEncoding.DecodeString(v) + if err != nil { + return nil, fmt.Errorf("failed to decode secure setting") + } + secureSettings[k] = d + } var ( cfg = &models.AlertNotification{ Uid: r.Uid, @@ -389,7 +395,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp SendReminder: r.SendReminder, DisableResolveMessage: r.DisableResolveMessage, Settings: r.Settings, - SecureSettings: securejsondata.GetEncryptedJsonData(r.SecureSettings), + SecureSettings: secureSettings, } n NotificationChannel err error @@ -409,6 +415,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewDingDingNotifier(cfg, tmpl) case "webhook": n, err = channels.NewWebHookNotifier(cfg, tmpl) + default: + return nil, fmt.Errorf("notifier %s is not supported", r.Type) } if err != nil { return nil, err From 796f005ebb9f32d2666fd172061f136a6e39fc5a Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 12 May 2021 15:13:43 +0530 Subject: [PATCH 30/85] NGAlert: Fix templating and remove unwanted default templates (#33918) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 85d5bb22..dc22575e 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ 
b/pkg/services/ngalert/notifier/alertmanager.go @@ -260,8 +260,6 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { return nil } -const defaultTemplate = "templates/default.tmpl" - // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) error { @@ -279,6 +277,12 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig if md5.Sum(am.config) != md5.Sum(rawConfig) { configChanged = true } + + if cfg.TemplateFiles == nil { + cfg.TemplateFiles = map[string]string{} + } + cfg.TemplateFiles["__default__.tmpl"] = channels.DefaultTemplateString + // next, we need to make sure we persist the templates to disk. paths, templatesChanged, err := PersistTemplates(cfg, am.WorkingDirPath()) if err != nil { @@ -291,8 +295,6 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig return nil } - paths = append([]string{defaultTemplate}, paths...) - // With the templates persisted, create the template list using the paths. tmpl, err := template.FromGlobs(paths...) 
if err != nil { From 82ae34bcc1d6c72097361730906101b8aec27ce7 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 13 May 2021 22:58:19 +0530 Subject: [PATCH 31/85] NGAlert: Add integration tests for notification channels (#33431) * NGAlert: Add integration tests for notification channels Signed-off-by: Ganesh Vernekar * Fix the failing tests Signed-off-by: Ganesh Vernekar * Fix review comments Signed-off-by: Ganesh Vernekar * Override creation of rule UID, remove only namespace UID Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index dc22575e..77e53d49 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -446,7 +446,7 @@ func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error UpdatedAt: now, } for k, v := range a.Labels { - if len(v) == 0 { // Skip empty labels. + if len(v) == 0 || k == ngmodels.NamespaceUIDLabel { // Skip empty and namespace UID labels. 
continue } alert.Alert.Labels[model.LabelName(k)] = model.LabelValue(v) From e33289e05fc11199d2e86f9527c5b689aba15ee7 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Thu, 13 May 2021 14:01:38 -0400 Subject: [PATCH 32/85] extracts alertmanager from DI, including migrations (#34071) * extracts alertmanager from DI, including migrations * includes alertmanager Run method in ngalert * removes 3s test shutdown timeout * lint --- pkg/services/ngalert/notifier/alertmanager.go | 50 ++++++------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 77e53d49..5c2dad8b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -28,14 +28,12 @@ import ( "github.com/grafana/grafana/pkg/components/securejsondata" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/models" - "github.com/grafana/grafana/pkg/registry" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" "github.com/grafana/grafana/pkg/services/sqlstore" - "github.com/grafana/grafana/pkg/services/sqlstore/migrator" "github.com/grafana/grafana/pkg/setting" ) @@ -106,41 +104,28 @@ type Alertmanager struct { config []byte } -func init() { - registry.RegisterService(&Alertmanager{}) -} - -func (am *Alertmanager) IsDisabled() bool { - if am.Settings == nil { - return true +func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { + am := &Alertmanager{ + Settings: cfg, + stopc: make(chan struct{}), + logger: log.New("alertmanager"), + marker: types.NewMarker(m.Registerer), + stageMetrics: 
notify.NewMetrics(m.Registerer), + dispatcherMetrics: dispatch.NewDispatcherMetrics(m.Registerer), + Store: store, + Metrics: m, } - return !am.Settings.IsNgAlertEnabled() -} - -func (am *Alertmanager) Init() error { - return am.InitWithMetrics(am.Metrics) -} - -// InitWithMetrics uses the supplied metrics for instantiation and -// allows testware to circumvent duplicate registration errors. -func (am *Alertmanager) InitWithMetrics(m *metrics.Metrics) (err error) { - am.stopc = make(chan struct{}) - am.logger = log.New("alertmanager") - am.marker = types.NewMarker(m.Registerer) - am.stageMetrics = notify.NewMetrics(m.Registerer) - am.dispatcherMetrics = dispatch.NewDispatcherMetrics(m.Registerer) - am.Metrics = m - am.Store = store.DBstore{SQLStore: am.SQLStore} // Initialize the notification log am.wg.Add(1) + var err error am.notificationLog, err = nflog.New( nflog.WithRetention(retentionNotificationsAndSilences), nflog.WithSnapshot(filepath.Join(am.WorkingDirPath(), "notifications")), nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done), ) if err != nil { - return fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) + return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) } // Initialize silences am.silences, err = silence.New(silence.Options{ @@ -149,7 +134,7 @@ func (am *Alertmanager) InitWithMetrics(m *metrics.Metrics) (err error) { Retention: retentionNotificationsAndSilences, }) if err != nil { - return fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) + return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } am.wg.Add(1) @@ -161,10 +146,10 @@ func (am *Alertmanager) InitWithMetrics(m *metrics.Metrics) (err error) { // Initialize in-memory alerts am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, gokit_log.NewNopLogger()) if err != nil { - return 
fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) + return nil, fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) } - return nil + return am, nil } func (am *Alertmanager) Run(ctx context.Context) error { @@ -185,11 +170,6 @@ func (am *Alertmanager) Run(ctx context.Context) error { } } -// AddMigration runs the database migrations as the service starts. -func (am *Alertmanager) AddMigration(mg *migrator.Migrator) { - alertmanagerConfigurationMigration(mg) -} - func (am *Alertmanager) StopAndWait() error { if am.dispatcher != nil { am.dispatcher.Stop() From 648760fbed44cbf828871c1f985886afb304e4ad Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 14 May 2021 19:49:54 +0100 Subject: [PATCH 33/85] Alerting: Modify configuration apply and save semantics - v2 (#34143) * Save default configuration to the database and copy over secure settings --- pkg/services/ngalert/notifier/alertmanager.go | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 5c2dad8b..abf06eaa 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -187,6 +187,8 @@ func (am *Alertmanager) StopAndWait() error { return nil } +// SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager. +// It rollbacks the save if we fail to apply the configuration. 
func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { rawConfig, err := json.Marshal(&cfg) if err != nil { @@ -201,11 +203,14 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), } - if err := am.Store.SaveAlertmanagerConfiguration(cmd); err != nil { - return fmt.Errorf("failed to save Alertmanager configuration: %w", err) - } - if err := am.applyConfig(cfg, rawConfig); err != nil { - return fmt.Errorf("unable to reload configuration: %w", err) + err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { + if err := am.applyConfig(cfg, rawConfig); err != nil { + return err + } + return nil + }) + if err != nil { + return err } return nil @@ -222,7 +227,18 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { // If there's no configuration in the database, let's use the default configuration. if errors.Is(err, store.ErrNoAlertmanagerConfiguration) { - q.Result = &ngmodels.AlertConfiguration{AlertmanagerConfiguration: alertmanagerDefaultConfiguration} + // First, let's save it to the database. We don't need to use a transaction here as we'll always succeed. 
+ am.logger.Info("no Alertmanager configuration found, saving and applying a default") + savecmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ + AlertmanagerConfiguration: alertmanagerDefaultConfiguration, + Default: true, + ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), + } + if err := am.Store.SaveAlertmanagerConfiguration(savecmd); err != nil { + return err + } + + q.Result = &ngmodels.AlertConfiguration{AlertmanagerConfiguration: alertmanagerDefaultConfiguration, Default: true} } else { return fmt.Errorf("unable to get Alertmanager configuration from the database: %w", err) } From 230e54a9d9118433e02ad0ca99be66d862148c7b Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Fri, 14 May 2021 16:13:44 -0400 Subject: [PATCH 34/85] Alerting/ruler metrics (#34144) * adds active configurations metric * rule evaluation metrics * ruler metrics * pr feedback --- pkg/services/ngalert/notifier/alertmanager.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index abf06eaa..87106302 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -212,6 +212,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er if err != nil { return err } + am.Metrics.ActiveConfigurations.Set(1) return nil } @@ -253,6 +254,12 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { return fmt.Errorf("unable to reload configuration: %w", err) } + if q.Result.Default { + am.Metrics.ActiveConfigurations.Set(0) + } else { + am.Metrics.ActiveConfigurations.Set(1) + } + return nil } From 1acb6a4ebc3f628d5cf02cc260027967105bab82 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Mon, 17 May 2021 16:05:09 +0530 Subject: [PATCH 35/85] NGAlert: Add message field to email notification channel (#34044) Signed-off-by: Ganesh Vernekar --- 
pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 87106302..3392bb8b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -407,7 +407,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp ) switch r.Type { case "email": - n, err = channels.NewEmailNotifier(cfg, tmpl.ExternalURL) // Email notifier already has a default template. + n, err = channels.NewEmailNotifier(cfg, tmpl) // Email notifier already has a default template. case "pagerduty": n, err = channels.NewPagerdutyNotifier(cfg, tmpl) case "slack": From 25dbb85aa29bf1f7ba6f7011ab0dd97283308adb Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 17 May 2021 18:06:47 +0100 Subject: [PATCH 36/85] Alerting: Allow the notifier to log (#34232) * Alerting: Allow the notifier to log The notifier upstream code uses go-kit as its logging library. The grafana specific logger is not compatible with this API. In this PR, I have created a wrapper that implements io.Writer to make them compatible. 
--- pkg/services/ngalert/notifier/alertmanager.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 3392bb8b..e83b06d6 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -29,6 +29,7 @@ import ( "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/models" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" + "github.com/grafana/grafana/pkg/services/ngalert/logging" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" @@ -75,7 +76,9 @@ const ( ) type Alertmanager struct { - logger log.Logger + logger log.Logger + gokitLogger gokit_log.Logger + Settings *setting.Cfg `inject:""` SQLStore *sqlstore.SQLStore `inject:""` Store store.AlertingStore @@ -116,6 +119,8 @@ func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Aler Metrics: m, } + am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)) + // Initialize the notification log am.wg.Add(1) var err error @@ -144,7 +149,7 @@ func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Aler }() // Initialize in-memory alerts - am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, gokit_log.NewNopLogger()) + am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, am.gokitLogger) if err != nil { return nil, fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) } @@ -324,8 +329,8 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig am.dispatcher.Stop() } - am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, 
gokit_log.NewNopLogger()) - am.silencer = silence.NewSilencer(am.silences, am.marker, gokit_log.NewNopLogger()) + am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger) + am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger) inhibitionStage := notify.NewMuteStage(am.inhibitor) silencingStage := notify.NewMuteStage(am.silencer) @@ -335,7 +340,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, gokit_log.NewNopLogger(), am.dispatcherMetrics) + am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, am.gokitLogger, am.dispatcherMetrics) am.wg.Add(1) go func() { From 2bb87e97f0e5da5a02db4e2121d06a0676e8f219 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Tue, 18 May 2021 13:34:47 +0530 Subject: [PATCH 37/85] NGAlert: Remove unwanted fields from notification channel config (#34036) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index e83b06d6..78b35bf6 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -27,7 +27,6 @@ import ( "github.com/grafana/grafana/pkg/components/securejsondata" "github.com/grafana/grafana/pkg/infra/log" - "github.com/grafana/grafana/pkg/models" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/logging" "github.com/grafana/grafana/pkg/services/ngalert/metrics" @@ -397,12 +396,10 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver 
*apimodels.PostableAp secureSettings[k] = d } var ( - cfg = &models.AlertNotification{ - Uid: r.Uid, + cfg = &channels.NotificationChannelConfig{ + UID: r.UID, Name: r.Name, Type: r.Type, - IsDefault: r.IsDefault, - SendReminder: r.SendReminder, DisableResolveMessage: r.DisableResolveMessage, Settings: r.Settings, SecureSettings: secureSettings, From 79e6e33d5a8e35ec0367da69b368c0fdc52fe04b Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Tue, 18 May 2021 17:31:51 +0300 Subject: [PATCH 38/85] [Alerting]: Add Sensu Go integration with the alert manager (#34045) * [Alerting]: Add sensugo notification channel * Apply suggestions from code review Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * Do not include labels with concatenated rule UID and names * Modifications after syncing with main Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 78b35bf6..5a12a43f 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -422,6 +422,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewDingDingNotifier(cfg, tmpl) case "webhook": n, err = channels.NewWebHookNotifier(cfg, tmpl) + case "sensugo": + n, err = channels.NewSensuGoNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From 1873b9fa6cb2a0934cda1210a89cda6186017d02 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 19 May 2021 15:27:41 +0200 Subject: [PATCH 39/85] Alerting: Migrate Alertmanager notifier (#34304) * Alerting: Port Alertmanager notifier to v8 Signed-off-by: Arve Knudsen --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 5a12a43f..da43bef9 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -424,6 +424,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewWebHookNotifier(cfg, tmpl) case "sensugo": n, err = channels.NewSensuGoNotifier(cfg, tmpl) + case "alertmanager": + n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From 0015669714747acdc1d5a4ae351ce659fca866ed Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Wed, 19 May 2021 17:48:46 +0300 Subject: [PATCH 40/85] [Alerting]: Add Pushover integration with the alert manager (#34371) * [Alerting]: Add Pushover integration with the alert manager * lint * Set boundary only for tests * Remove title field * fix imports --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index da43bef9..405e6393 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -412,6 +412,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewEmailNotifier(cfg, tmpl) // Email notifier already has a default template. 
case "pagerduty": n, err = channels.NewPagerdutyNotifier(cfg, tmpl) + case "pushover": + n, err = channels.NewPushoverNotifier(cfg, tmpl) case "slack": n, err = channels.NewSlackNotifier(cfg, tmpl) case "telegram": From 0de70e09ee16f87ef78f315c367a3c409d99fb1f Mon Sep 17 00:00:00 2001 From: David Parrott Date: Wed, 19 May 2021 08:31:55 -0700 Subject: [PATCH 41/85] Add discord notifier channel and test (#34150) * Add discord notifier channel and test * Correct payload * remove print statement * PR feedback and update due to changes in main * Add discord notifier channel and test * Correct payload * remove print statement * PR feedback and update due to changes in main * update constructor and tests * group imports sensibly * Fix lint Signed-off-by: Ganesh Vernekar Co-authored-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 405e6393..4397080b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -426,6 +426,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewWebHookNotifier(cfg, tmpl) case "sensugo": n, err = channels.NewSensuGoNotifier(cfg, tmpl) + case "discord": + n, err = channels.NewDiscordNotifier(cfg, tmpl) case "alertmanager": n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) default: From 08d6c8aebded0f096eb7116360408ed5601ec616 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 19 May 2021 21:54:04 +0530 Subject: [PATCH 42/85] NGAlert: Add GoogleChat notification channel (#34153) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 
4397080b..5585e844 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -430,6 +430,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewDiscordNotifier(cfg, tmpl) case "alertmanager": n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) + case "googlechat": + n, err = channels.NewGoogleChatNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From 62a58a0b650d2ee1bd14a4abb60bc221c48bcb49 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 19 May 2021 22:34:48 +0530 Subject: [PATCH 43/85] NGAlert: Add Line notification channel (#34157) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 5585e844..33d61ac3 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -432,6 +432,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) case "googlechat": n, err = channels.NewGoogleChatNotifier(cfg, tmpl) + case "line": + n, err = channels.NewLineNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From b9e4e2bcaceed75509dc8c03c6b5cf4bb5ed989b Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 19 May 2021 23:22:14 +0530 Subject: [PATCH 44/85] NGAlert: Add VictorOps notification channel (#34161) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 33d61ac3..7ca367a0 100644 --- 
a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -418,6 +418,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewSlackNotifier(cfg, tmpl) case "telegram": n, err = channels.NewTelegramNotifier(cfg, tmpl) + case "victorops": + n, err = channels.NewVictoropsNotifier(cfg, tmpl) case "teams": n, err = channels.NewTeamsNotifier(cfg, tmpl) case "dingding": From 1157d7b4fb7151b3ec5fa7518a77e098435b6e6c Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 19 May 2021 23:32:09 +0530 Subject: [PATCH 45/85] NGAlert: Add Kafka notification channel (#34156) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 7ca367a0..ecc8464f 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -424,6 +424,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewTeamsNotifier(cfg, tmpl) case "dingding": n, err = channels.NewDingDingNotifier(cfg, tmpl) + case "kafka": + n, err = channels.NewKafkaNotifier(cfg, tmpl) case "webhook": n, err = channels.NewWebHookNotifier(cfg, tmpl) case "sensugo": From c378bf8afddd7873b556278cf429e21f6932cee0 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 19 May 2021 23:50:52 +0530 Subject: [PATCH 46/85] NGAlert: Add Threema notification channel (#34159) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index ecc8464f..6c89c505 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ 
b/pkg/services/ngalert/notifier/alertmanager.go @@ -438,6 +438,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewGoogleChatNotifier(cfg, tmpl) case "line": n, err = channels.NewLineNotifier(cfg, tmpl) + case "threema": + n, err = channels.NewThreemaNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From b8587305961b141f13fa67f5de3456c7c75badc5 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 20 May 2021 09:12:08 +0100 Subject: [PATCH 47/85] Alerting: Opsgenie notification channel (#34418) * Alerting: Opsgenie notification channel This translate the opsgenie notification channel from the old alerting system to the new alerting system with a few changes: - The tag system has been replaced in favour of annotation. - TBD - TBD Signed-off-by: Josue Abreu * Fix template URL * Bugfig: dont send resolved when autoClose is false Signed-off-by: Ganesh Vernekar * Fix integration tests Signed-off-by: Ganesh Vernekar * Fix URLs in all other channels Signed-off-by: Ganesh Vernekar Co-authored-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 6c89c505..31759307 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -440,6 +440,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewLineNotifier(cfg, tmpl) case "threema": n, err = channels.NewThreemaNotifier(cfg, tmpl) + case "opsgenie": + n, err = channels.NewOpsgenieNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From e5de8f0910432e695e6f75387f43eccb1b28898d Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Mon, 24 May 2021 16:09:29 +0300 Subject: [PATCH 48/85] [Alerting]: alertmanager notifier fixes 
(#34575) --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 31759307..c286e015 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -442,6 +442,8 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewThreemaNotifier(cfg, tmpl) case "opsgenie": n, err = channels.NewOpsgenieNotifier(cfg, tmpl) + case "prometheus-alertmanager": + n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) default: return nil, fmt.Errorf("notifier %s is not supported", r.Type) } From 673758d7336762394485545d1d713abfce44e1bb Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 26 May 2021 16:33:55 +0530 Subject: [PATCH 49/85] NGAlert: Add integration tests for remaining notification channels (#34662) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index c286e015..faf5a8e6 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -432,8 +432,6 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewSensuGoNotifier(cfg, tmpl) case "discord": n, err = channels.NewDiscordNotifier(cfg, tmpl) - case "alertmanager": - n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) case "googlechat": n, err = channels.NewGoogleChatNotifier(cfg, tmpl) case "line": From 3863716c83f453a7a82743ebb2d4e4f7161ff4d8 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Fri, 28 May 2021 12:31:23 -0400 Subject: [PATCH 50/85] alerting: fixes per-receiver metric cardinality (#34915) --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index faf5a8e6..03b9280b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -448,7 +448,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp if err != nil { return nil, err } - integrations = append(integrations, notify.NewIntegration(n, n, r.Name, i)) + integrations = append(integrations, notify.NewIntegration(n, n, r.Type, i)) } return integrations, nil From 0c25bbea71d6ea01eba0cd8d29ffe3463636224f Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 8 Jul 2021 18:26:09 +0530 Subject: [PATCH 51/85] Alerting: Allow space in label and annotation names (#36549) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 57 ++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 03b9280b..85e1e3b4 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -504,7 +504,7 @@ func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error am.Metrics.Resolved().Inc() } - if err := alert.Validate(); err != nil { + if err := validateAlert(alert); err != nil { if validationErr == nil { validationErr = &AlertValidationError{} } @@ -528,6 +528,59 @@ func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error return nil } +// validateAlert is a.Validate() while additionally allowing +// space for label and annotation names. 
+func validateAlert(a *types.Alert) error { + if a.StartsAt.IsZero() { + return fmt.Errorf("start time missing") + } + if !a.EndsAt.IsZero() && a.EndsAt.Before(a.StartsAt) { + return fmt.Errorf("start time must be before end time") + } + if err := validateLabelSet(a.Labels); err != nil { + return fmt.Errorf("invalid label set: %s", err) + } + if len(a.Labels) == 0 { + return fmt.Errorf("at least one label pair required") + } + if err := validateLabelSet(a.Annotations); err != nil { + return fmt.Errorf("invalid annotations: %s", err) + } + return nil +} + +// validateLabelSet is ls.Validate() while additionally allowing +// space for label names. +func validateLabelSet(ls model.LabelSet) error { + for ln, lv := range ls { + if !isValidLabelName(ln) { + return fmt.Errorf("invalid name %q", ln) + } + if !lv.IsValid() { + return fmt.Errorf("invalid value %q", lv) + } + } + return nil +} + +// isValidLabelName is ln.IsValid() while additionally allowing spaces. +// The regex for Prometheus data model is ^[a-zA-Z_][a-zA-Z0-9_]*$ +// while we will follow ^[a-zA-Z_][a-zA-Z0-9_ ]*$ +func isValidLabelName(ln model.LabelName) bool { + if len(ln) == 0 { + return false + } + for i, b := range ln { + if !((b >= 'a' && b <= 'z') || + (b >= 'A' && b <= 'Z') || + b == '_' || + (i > 0 && (b == ' ' || (b >= '0' && b <= '9')))) { + return false + } + } + return true +} + // AlertValidationError is the error capturing the validation errors // faced on the alerts. 
type AlertValidationError struct { @@ -538,7 +591,7 @@ type AlertValidationError struct { func (e AlertValidationError) Error() string { errMsg := "" if len(e.Errors) != 0 { - errMsg := e.Errors[0].Error() + errMsg = e.Errors[0].Error() for _, e := range e.Errors[1:] { errMsg += ";" + e.Error() } From aae817abfac9803fd76bde13be5432a574b7ea05 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Mon, 12 Jul 2021 11:15:16 +0530 Subject: [PATCH 52/85] Alerting: Fix potential panic in Alertmanager when starting up (#36562) * Alerting: Fix potential panic in Alertmanager when starting up Signed-off-by: Ganesh Vernekar * Fix reviews Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 85e1e3b4..241e79a8 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -104,6 +104,8 @@ type Alertmanager struct { reloadConfigMtx sync.RWMutex config []byte + + initialised bool } func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { @@ -269,7 +271,15 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. -func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) error { +func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) (err error) { + defer func() { + if err == nil { + // We consider AM as initialised only when the config has been + // applied at least once successfully. Until then, some objects + // can still be nil. 
+ am.initialised = true + } + }() // First, let's make sure this config is not already loaded var configChanged bool if rawConfig == nil { From 25720c790eff7675212e98eb92b6e0462d0b2291 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Mon, 12 Jul 2021 18:53:01 +0530 Subject: [PATCH 53/85] Alerting: A better and cleaner way to know if Alertmanager is initialised (#36659) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 241e79a8..a763e9ef 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -104,8 +104,6 @@ type Alertmanager struct { reloadConfigMtx sync.RWMutex config []byte - - initialised bool } func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { @@ -158,6 +156,16 @@ func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Aler return am, nil } +func (am *Alertmanager) Ready() bool { + // We consider AM as ready only when the config has been + // applied at least once successfully. Until then, some objects + // can still be nil. + am.reloadConfigMtx.RLock() + defer am.reloadConfigMtx.RUnlock() + + return len(am.config) > 0 +} + func (am *Alertmanager) Run(ctx context.Context) error { // Make sure dispatcher starts. We can tolerate future reload failures. if err := am.SyncAndApplyConfigFromDatabase(); err != nil { @@ -272,14 +280,6 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. 
func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) (err error) { - defer func() { - if err == nil { - // We consider AM as initialised only when the config has been - // applied at least once successfully. Until then, some objects - // can still be nil. - am.initialised = true - } - }() // First, let's make sure this config is not already loaded var configChanged bool if rawConfig == nil { From 43a839438a9f45f8af245801f07bee57c5b2a1fa Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Fri, 16 Jul 2021 20:07:31 +0300 Subject: [PATCH 54/85] Alerting: deactivate an Alertmanager configuration (#36794) * Alerting: deactivate an Alertmanager configuration Implement DELETE /api/alertmanager/grafana/config/api/v1/alerts by storing the default configuration which stops existing cnfiguration from being in use. * Apply suggestions from code review --- pkg/services/ngalert/notifier/alertmanager.go | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index a763e9ef..3aced9ec 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -201,6 +201,37 @@ func (am *Alertmanager) StopAndWait() error { return nil } +// SaveAndApplyDefaultConfig saves the default configuration the database and applies the configuration to the Alertmanager. +// It rollbacks the save if we fail to apply the configuration. 
+func (am *Alertmanager) SaveAndApplyDefaultConfig() error { + am.reloadConfigMtx.Lock() + defer am.reloadConfigMtx.Unlock() + + cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ + AlertmanagerConfiguration: alertmanagerDefaultConfiguration, + Default: true, + ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), + } + + cfg, err := Load([]byte(alertmanagerDefaultConfiguration)) + if err != nil { + return err + } + + err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { + if err := am.applyConfig(cfg, []byte(alertmanagerDefaultConfiguration)); err != nil { + return err + } + return nil + }) + if err != nil { + return err + } + am.Metrics.ActiveConfigurations.Set(1) + + return nil +} + // SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { From b5e08ed6fdda6507c6926d040538e8cb9dc6faa3 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Mon, 26 Jul 2021 18:45:09 +0530 Subject: [PATCH 55/85] Alerting: Remove the fixed wait for notification delivery (#37203) Signed-off-by: Ganesh Vernekar --- pkg/services/ngalert/notifier/alertmanager.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 3aced9ec..5ba1d72c 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -661,7 +661,12 @@ func (am *Alertmanager) createReceiverStage(name string, integrations []notify.I } func waitFunc() time.Duration { - return setting.AlertingNotificationTimeout + // When it's a single instance, we don't need additional wait. The routing policies will have their own group wait. 
+ // We need >0 wait here in case we have peers to sync the notification state with. 0 wait in that case can result + // in duplicate notifications being sent. + // TODO: we have setting.AlertingNotificationTimeout in legacy settings. Either use that or separate set of config + // for clustering with intuitive name, like "PeerTimeout". + return 0 } func timeoutFunc(d time.Duration) time.Duration { From 90a36426ec334b2ccf65916d2c1a11e7472d5fba Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 6 Aug 2021 13:06:56 +0100 Subject: [PATCH 56/85] Alerting: Send alerts to external Alertmanager(s) (#37298) * Alerting: Send alerts to external Alertmanager(s) Within this PR we're adding support for registering or unregistering sending to a set of external alertmanagers. A few of the things that are going are: - Introduce a new table to hold "admin" (either org or global) configuration we can change at runtime. - A new periodic check that polls for this configuration and adjusts the "senders" accordingly. - Introduces a new concept of "senders" that are responsible for shipping the alerts to the external Alertmanager(s). In a nutshell, this is the Prometheus notifier (the one in charge of sending the alert) mapped to a multi-tenant map. There are a few code movements here and there but those are minor, I tried to keep things intact as much as possible so that we could have an easier diff. --- pkg/services/ngalert/notifier/alertmanager.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 5ba1d72c..afd395c7 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -511,6 +511,7 @@ func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error }, UpdatedAt: now, } + for k, v := range a.Labels { if len(v) == 0 || k == ngmodels.NamespaceUIDLabel { // Skip empty and namespace UID labels. 
continue From cab114cd9d4630b2404418cdd5cfa90ce9a2a05e Mon Sep 17 00:00:00 2001 From: SLAMA <36870081+xy-man@users.noreply.github.com> Date: Wed, 11 Aug 2021 02:59:53 +0900 Subject: [PATCH 57/85] Alerting frontend : fix line notifier (#37744) - Fixes #37425 - change `line` type string to uppercase --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index afd395c7..f68cb3a8 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -475,7 +475,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp n, err = channels.NewDiscordNotifier(cfg, tmpl) case "googlechat": n, err = channels.NewGoogleChatNotifier(cfg, tmpl) - case "line": + case "LINE": n, err = channels.NewLineNotifier(cfg, tmpl) case "threema": n, err = channels.NewThreemaNotifier(cfg, tmpl) From 190fdc9cb07d91b767c88a47e1a2acf57c653f2e Mon Sep 17 00:00:00 2001 From: Sofia Papagiannaki Date: Thu, 12 Aug 2021 16:04:09 +0300 Subject: [PATCH 58/85] Alerting: modify DB table, accessors and migration to restrict org access (#37414) * Alerting: modify table and accessors to limit org access appropriately * Update migration to create multiple Alertmanager configs * Apply suggestions from code review Co-authored-by: gotjosh * replace mg.ClearMigrationEntry() mg.ClearMigrationEntry() would create a new session. This commit introduces a new migration for clearing an entry from migration log for replacing mg.ClearMigrationEntry() so that all dashboard alert migration operations will run inside the same transaction. It adds also `SkipMigrationLog()` in Migrator interface for skipping adding an entry in the migration_log. 
Co-authored-by: gotjosh --- pkg/services/ngalert/notifier/alertmanager.go | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f68cb3a8..f9f9a41b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -9,6 +9,7 @@ import ( "fmt" "net/url" "path/filepath" + "strconv" "sync" "time" @@ -72,6 +73,8 @@ const ( } } ` + //TODO: temporary until fix org isolation + mainOrgID = 1 ) type Alertmanager struct { @@ -168,7 +171,7 @@ func (am *Alertmanager) Ready() bool { func (am *Alertmanager) Run(ctx context.Context) error { // Make sure dispatcher starts. We can tolerate future reload failures. - if err := am.SyncAndApplyConfigFromDatabase(); err != nil { + if err := am.SyncAndApplyConfigFromDatabase(mainOrgID); err != nil { am.logger.Error("unable to sync configuration", "err", err) } @@ -177,7 +180,7 @@ func (am *Alertmanager) Run(ctx context.Context) error { case <-ctx.Done(): return am.StopAndWait() case <-time.After(pollInterval): - if err := am.SyncAndApplyConfigFromDatabase(); err != nil { + if err := am.SyncAndApplyConfigFromDatabase(mainOrgID); err != nil { am.logger.Error("unable to sync configuration", "err", err) } } @@ -203,7 +206,7 @@ func (am *Alertmanager) StopAndWait() error { // SaveAndApplyDefaultConfig saves the default configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. 
-func (am *Alertmanager) SaveAndApplyDefaultConfig() error { +func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() @@ -211,6 +214,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error { AlertmanagerConfiguration: alertmanagerDefaultConfiguration, Default: true, ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), + OrgID: orgID, } cfg, err := Load([]byte(alertmanagerDefaultConfiguration)) @@ -234,7 +238,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error { // SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. -func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { +func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableUserConfig) error { rawConfig, err := json.Marshal(&cfg) if err != nil { return fmt.Errorf("failed to serialize to the Alertmanager configuration: %w", err) @@ -246,6 +250,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ AlertmanagerConfiguration: string(rawConfig), ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), + OrgID: orgID, } err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { @@ -264,12 +269,12 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er // SyncAndApplyConfigFromDatabase picks the latest config from database and restarts // the components with the new config. -func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { +func (am *Alertmanager) SyncAndApplyConfigFromDatabase(orgID int64) error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() // First, let's get the configuration we need from the database. 
- q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{} + q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{OrgID: mainOrgID} if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { // If there's no configuration in the database, let's use the default configuration. if errors.Is(err, store.ErrNoAlertmanagerConfiguration) { @@ -279,6 +284,7 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { AlertmanagerConfiguration: alertmanagerDefaultConfiguration, Default: true, ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), + OrgID: orgID, } if err := am.Store.SaveAlertmanagerConfiguration(savecmd); err != nil { return err @@ -399,7 +405,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } func (am *Alertmanager) WorkingDirPath() string { - return filepath.Join(am.Settings.DataPath, workingDir) + return filepath.Join(am.Settings.DataPath, workingDir, strconv.Itoa(mainOrgID)) } // buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config. From f34ce486e32af1af50d311db44d1d6983b853faf Mon Sep 17 00:00:00 2001 From: George Robinson <85952834+gerobinson@users.noreply.github.com> Date: Tue, 17 Aug 2021 13:49:05 +0100 Subject: [PATCH 59/85] Contact point testing (#37308) This commit adds contact point testing to ngalerts via a new API endpoint. This endpoint accepts JSON containing a list of receiver configurations which are validated and then tested with a notification for a test alert. The endpoint returns JSON for each receiver with a status and error message. It accepts a configurable timeout via the Request-Timeout header (in seconds) up to a maximum of 30 seconds. 
--- pkg/services/ngalert/notifier/alertmanager.go | 188 +++++++++++------- 1 file changed, 117 insertions(+), 71 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f9f9a41b..58e828a5 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -106,7 +106,8 @@ type Alertmanager struct { dispatcherMetrics *dispatch.DispatcherMetrics reloadConfigMtx sync.RWMutex - config []byte + config *apimodels.PostableUserConfig + configHash [16]byte } func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { @@ -166,7 +167,11 @@ func (am *Alertmanager) Ready() bool { am.reloadConfigMtx.RLock() defer am.reloadConfigMtx.RUnlock() - return len(am.config) > 0 + return am.ready() +} + +func (am *Alertmanager) ready() bool { + return am.config != nil } func (am *Alertmanager) Run(ctx context.Context) error { @@ -314,6 +319,32 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase(orgID int64) error { return nil } +func (am *Alertmanager) getTemplate() (*template.Template, error) { + am.reloadConfigMtx.RLock() + defer am.reloadConfigMtx.RUnlock() + if !am.ready() { + return nil, errors.New("alertmanager is not initialized") + } + paths := make([]string, 0, len(am.config.TemplateFiles)) + for name := range am.config.TemplateFiles { + paths = append(paths, filepath.Join(am.WorkingDirPath(), name)) + } + return am.templateFromPaths(paths...) +} + +func (am *Alertmanager) templateFromPaths(paths ...string) (*template.Template, error) { + tmpl, err := template.FromGlobs(paths...) + if err != nil { + return nil, err + } + externalURL, err := url.Parse(am.Settings.AppURL) + if err != nil { + return nil, err + } + tmpl.ExternalURL = externalURL + return tmpl, nil +} + // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. 
func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) (err error) { @@ -328,7 +359,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig rawConfig = enc } - if md5.Sum(am.config) != md5.Sum(rawConfig) { + if am.configHash != md5.Sum(rawConfig) { configChanged = true } @@ -350,15 +381,10 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } // With the templates persisted, create the template list using the paths. - tmpl, err := template.FromGlobs(paths...) + tmpl, err := am.templateFromPaths(paths...) if err != nil { return err } - externalURL, err := url.Parse(am.Settings.AppURL) - if err != nil { - return err - } - tmpl.ExternalURL = externalURL // Finally, build the integrations map using the receiver configuration and templates. integrationsMap, err := am.buildIntegrationsMap(cfg.AlertmanagerConfig.Receivers, tmpl) @@ -400,7 +426,9 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig am.inhibitor.Run() }() - am.config = rawConfig + am.config = cfg + am.configHash = md5.Sum(rawConfig) + return nil } @@ -430,75 +458,93 @@ type NotificationChannel interface { // buildReceiverIntegrations builds a list of integration notifiers off of a receiver config. 
func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) { var integrations []notify.Integration - for i, r := range receiver.GrafanaManagedReceivers { - // secure settings are already encrypted at this point - secureSettings := securejsondata.SecureJsonData(make(map[string][]byte, len(r.SecureSettings))) + n, err := am.buildReceiverIntegration(r, tmpl) + if err != nil { + return nil, err + } + integrations = append(integrations, notify.NewIntegration(n, n, r.Type, i)) + } + return integrations, nil +} - for k, v := range r.SecureSettings { - d, err := base64.StdEncoding.DecodeString(v) - if err != nil { - return nil, fmt.Errorf("failed to decode secure setting") +func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaReceiver, tmpl *template.Template) (NotificationChannel, error) { + // secure settings are already encrypted at this point + secureSettings := securejsondata.SecureJsonData(make(map[string][]byte, len(r.SecureSettings))) + + for k, v := range r.SecureSettings { + d, err := base64.StdEncoding.DecodeString(v) + if err != nil { + return nil, InvalidReceiverError{ + Receiver: r, + Err: errors.New("failed to decode secure setting"), } - secureSettings[k] = d } - var ( - cfg = &channels.NotificationChannelConfig{ - UID: r.UID, - Name: r.Name, - Type: r.Type, - DisableResolveMessage: r.DisableResolveMessage, - Settings: r.Settings, - SecureSettings: secureSettings, - } - n NotificationChannel - err error - ) - switch r.Type { - case "email": - n, err = channels.NewEmailNotifier(cfg, tmpl) // Email notifier already has a default template. 
- case "pagerduty": - n, err = channels.NewPagerdutyNotifier(cfg, tmpl) - case "pushover": - n, err = channels.NewPushoverNotifier(cfg, tmpl) - case "slack": - n, err = channels.NewSlackNotifier(cfg, tmpl) - case "telegram": - n, err = channels.NewTelegramNotifier(cfg, tmpl) - case "victorops": - n, err = channels.NewVictoropsNotifier(cfg, tmpl) - case "teams": - n, err = channels.NewTeamsNotifier(cfg, tmpl) - case "dingding": - n, err = channels.NewDingDingNotifier(cfg, tmpl) - case "kafka": - n, err = channels.NewKafkaNotifier(cfg, tmpl) - case "webhook": - n, err = channels.NewWebHookNotifier(cfg, tmpl) - case "sensugo": - n, err = channels.NewSensuGoNotifier(cfg, tmpl) - case "discord": - n, err = channels.NewDiscordNotifier(cfg, tmpl) - case "googlechat": - n, err = channels.NewGoogleChatNotifier(cfg, tmpl) - case "LINE": - n, err = channels.NewLineNotifier(cfg, tmpl) - case "threema": - n, err = channels.NewThreemaNotifier(cfg, tmpl) - case "opsgenie": - n, err = channels.NewOpsgenieNotifier(cfg, tmpl) - case "prometheus-alertmanager": - n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) - default: - return nil, fmt.Errorf("notifier %s is not supported", r.Type) + secureSettings[k] = d + } + + var ( + cfg = &channels.NotificationChannelConfig{ + UID: r.UID, + Name: r.Name, + Type: r.Type, + DisableResolveMessage: r.DisableResolveMessage, + Settings: r.Settings, + SecureSettings: secureSettings, } - if err != nil { - return nil, err + n NotificationChannel + err error + ) + switch r.Type { + case "email": + n, err = channels.NewEmailNotifier(cfg, tmpl) // Email notifier already has a default template. 
+ case "pagerduty": + n, err = channels.NewPagerdutyNotifier(cfg, tmpl) + case "pushover": + n, err = channels.NewPushoverNotifier(cfg, tmpl) + case "slack": + n, err = channels.NewSlackNotifier(cfg, tmpl) + case "telegram": + n, err = channels.NewTelegramNotifier(cfg, tmpl) + case "victorops": + n, err = channels.NewVictoropsNotifier(cfg, tmpl) + case "teams": + n, err = channels.NewTeamsNotifier(cfg, tmpl) + case "dingding": + n, err = channels.NewDingDingNotifier(cfg, tmpl) + case "kafka": + n, err = channels.NewKafkaNotifier(cfg, tmpl) + case "webhook": + n, err = channels.NewWebHookNotifier(cfg, tmpl) + case "sensugo": + n, err = channels.NewSensuGoNotifier(cfg, tmpl) + case "discord": + n, err = channels.NewDiscordNotifier(cfg, tmpl) + case "googlechat": + n, err = channels.NewGoogleChatNotifier(cfg, tmpl) + case "LINE": + n, err = channels.NewLineNotifier(cfg, tmpl) + case "threema": + n, err = channels.NewThreemaNotifier(cfg, tmpl) + case "opsgenie": + n, err = channels.NewOpsgenieNotifier(cfg, tmpl) + case "prometheus-alertmanager": + n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) + default: + return nil, InvalidReceiverError{ + Receiver: r, + Err: fmt.Errorf("notifier %s is not supported", r.Type), } - integrations = append(integrations, notify.NewIntegration(n, n, r.Type, i)) } - return integrations, nil + if err != nil { + return nil, InvalidReceiverError{ + Receiver: r, + Err: err, + } + } + + return n, nil } // PutAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not From c3db81bbaf33191f564a76d42ef5f16a15f55ac8 Mon Sep 17 00:00:00 2001 From: David Parrott Date: Tue, 24 Aug 2021 03:28:09 -0700 Subject: [PATCH 60/85] Alerting: create wrapper for Alertmanager to enable org level isolation (#37320) Introduces org-level isolation for the Alertmanager and its components. Silences, Alerts and Contact points are not separated by org and are not shared between them. 
Co-authored with @davidmparrott and @papagian --- pkg/services/ngalert/notifier/alertmanager.go | 56 ++++++------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 58e828a5..da9f29b3 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -34,13 +34,11 @@ import ( ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" - "github.com/grafana/grafana/pkg/services/sqlstore" "github.com/grafana/grafana/pkg/setting" ) const ( - pollInterval = 1 * time.Minute - workingDir = "alerting" + workingDir = "alerting" // How long should we keep silences and notification entries on-disk after they've served their purpose. retentionNotificationsAndSilences = 5 * 24 * time.Hour // maintenanceNotificationAndSilences how often should we flush and gargabe collect notifications and silences @@ -73,18 +71,15 @@ const ( } } ` - //TODO: temporary until fix org isolation - mainOrgID = 1 ) type Alertmanager struct { logger log.Logger gokitLogger gokit_log.Logger - Settings *setting.Cfg `inject:""` - SQLStore *sqlstore.SQLStore `inject:""` + Settings *setting.Cfg Store store.AlertingStore - Metrics *metrics.Metrics `inject:""` + Metrics *metrics.Metrics notificationLog *nflog.Log marker types.Marker @@ -108,18 +103,20 @@ type Alertmanager struct { reloadConfigMtx sync.RWMutex config *apimodels.PostableUserConfig configHash [16]byte + orgID int64 } -func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { +func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, stopc: make(chan struct{}), - logger: log.New("alertmanager"), + logger: 
log.New("alertmanager", "org", orgID), marker: types.NewMarker(m.Registerer), stageMetrics: notify.NewMetrics(m.Registerer), dispatcherMetrics: dispatch.NewDispatcherMetrics(m.Registerer), Store: store, Metrics: m, + orgID: orgID, } am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)) @@ -174,25 +171,7 @@ func (am *Alertmanager) ready() bool { return am.config != nil } -func (am *Alertmanager) Run(ctx context.Context) error { - // Make sure dispatcher starts. We can tolerate future reload failures. - if err := am.SyncAndApplyConfigFromDatabase(mainOrgID); err != nil { - am.logger.Error("unable to sync configuration", "err", err) - } - - for { - select { - case <-ctx.Done(): - return am.StopAndWait() - case <-time.After(pollInterval): - if err := am.SyncAndApplyConfigFromDatabase(mainOrgID); err != nil { - am.logger.Error("unable to sync configuration", "err", err) - } - } - } -} - -func (am *Alertmanager) StopAndWait() error { +func (am *Alertmanager) StopAndWait() { if am.dispatcher != nil { am.dispatcher.Stop() } @@ -206,12 +185,11 @@ func (am *Alertmanager) StopAndWait() error { close(am.stopc) am.wg.Wait() - return nil } // SaveAndApplyDefaultConfig saves the default configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. 
-func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error { +func (am *Alertmanager) SaveAndApplyDefaultConfig() error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() @@ -219,7 +197,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error { AlertmanagerConfiguration: alertmanagerDefaultConfiguration, Default: true, ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), - OrgID: orgID, + OrgID: am.orgID, } cfg, err := Load([]byte(alertmanagerDefaultConfiguration)) @@ -243,7 +221,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error { // SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. -func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableUserConfig) error { +func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { rawConfig, err := json.Marshal(&cfg) if err != nil { return fmt.Errorf("failed to serialize to the Alertmanager configuration: %w", err) @@ -255,7 +233,7 @@ func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableU cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ AlertmanagerConfiguration: string(rawConfig), ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), - OrgID: orgID, + OrgID: am.orgID, } err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { @@ -274,12 +252,12 @@ func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableU // SyncAndApplyConfigFromDatabase picks the latest config from database and restarts // the components with the new config. 
-func (am *Alertmanager) SyncAndApplyConfigFromDatabase(orgID int64) error { +func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() // First, let's get the configuration we need from the database. - q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{OrgID: mainOrgID} + q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{OrgID: am.orgID} if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { // If there's no configuration in the database, let's use the default configuration. if errors.Is(err, store.ErrNoAlertmanagerConfiguration) { @@ -289,7 +267,7 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase(orgID int64) error { AlertmanagerConfiguration: alertmanagerDefaultConfiguration, Default: true, ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), - OrgID: orgID, + OrgID: am.orgID, } if err := am.Store.SaveAlertmanagerConfiguration(savecmd); err != nil { return err @@ -389,7 +367,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig // Finally, build the integrations map using the receiver configuration and templates. integrationsMap, err := am.buildIntegrationsMap(cfg.AlertmanagerConfig.Receivers, tmpl) if err != nil { - return err + return fmt.Errorf("failed to build integration map: %w", err) } // Now, let's put together our notification pipeline routingStage := make(notify.RoutingStage, len(integrationsMap)) @@ -433,7 +411,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } func (am *Alertmanager) WorkingDirPath() string { - return filepath.Join(am.Settings.DataPath, workingDir, strconv.Itoa(mainOrgID)) + return filepath.Join(am.Settings.DataPath, workingDir, strconv.Itoa(int(am.orgID))) } // buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config. 
From b51d09095df6405426a2149b35dce8b79e3ad9dd Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 9 Sep 2021 17:25:22 +0100 Subject: [PATCH 61/85] Alerting: Persist notification log and silences to the database (#39005) * Alerting: Persist notification log and silences to the database This removes the dependency of having persistent disk to run grafana alerting. Instead of regularly flushing the notification log and silences to disk we now flush the binary content of those files to the database encoded as a base64 string. --- pkg/services/ngalert/notifier/alertmanager.go | 46 ++++++++++++++----- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index da9f29b3..891a2bb9 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -27,6 +27,7 @@ import ( "github.com/prometheus/common/model" "github.com/grafana/grafana/pkg/components/securejsondata" + "github.com/grafana/grafana/pkg/infra/kvstore" "github.com/grafana/grafana/pkg/infra/log" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/logging" @@ -38,6 +39,9 @@ import ( ) const ( + notificationLogFilename = "notifications" + silencesFilename = "silences" + workingDir = "alerting" // How long should we keep silences and notification entries on-disk after they've served their purpose. 
retentionNotificationsAndSilences = 5 * 24 * time.Hour @@ -77,9 +81,10 @@ type Alertmanager struct { logger log.Logger gokitLogger gokit_log.Logger - Settings *setting.Cfg - Store store.AlertingStore - Metrics *metrics.Metrics + Settings *setting.Cfg + Store store.AlertingStore + fileStore *FileStore + Metrics *metrics.Metrics notificationLog *nflog.Log marker types.Marker @@ -106,28 +111,39 @@ type Alertmanager struct { orgID int64 } -func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) { +func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Metrics) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, stopc: make(chan struct{}), logger: log.New("alertmanager", "org", orgID), marker: types.NewMarker(m.Registerer), stageMetrics: notify.NewMetrics(m.Registerer), - dispatcherMetrics: dispatch.NewDispatcherMetrics(m.Registerer), + dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), Store: store, Metrics: m, orgID: orgID, } am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)) + am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath()) + + nflogFilepath, err := am.fileStore.FilepathFor(context.TODO(), notificationLogFilename) + if err != nil { + return nil, err + } + silencesFilePath, err := am.fileStore.FilepathFor(context.TODO(), silencesFilename) + if err != nil { + return nil, err + } // Initialize the notification log am.wg.Add(1) - var err error am.notificationLog, err = nflog.New( nflog.WithRetention(retentionNotificationsAndSilences), - nflog.WithSnapshot(filepath.Join(am.WorkingDirPath(), "notifications")), - nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done), + nflog.WithSnapshot(nflogFilepath), + nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done, func() (int64, error) { + return am.fileStore.Persist(context.TODO(), 
notificationLogFilename, am.notificationLog) + }), ) if err != nil { return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) @@ -135,7 +151,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m // Initialize silences am.silences, err = silence.New(silence.Options{ Metrics: m.Registerer, - SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"), + SnapshotFile: silencesFilePath, Retention: retentionNotificationsAndSilences, }) if err != nil { @@ -144,12 +160,14 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m am.wg.Add(1) go func() { - am.silences.Maintenance(15*time.Minute, filepath.Join(am.WorkingDirPath(), "silences"), am.stopc) + am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) { + return am.fileStore.Persist(context.TODO(), silencesFilename, am.silences) + }) am.wg.Done() }() // Initialize in-memory alerts - am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, am.gokitLogger) + am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, nil, am.gokitLogger) if err != nil { return nil, fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) } @@ -390,7 +408,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, am.gokitLogger, am.dispatcherMetrics) + am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) am.wg.Add(1) go func() { @@ -707,3 +725,7 @@ func timeoutFunc(d time.Duration) time.Duration { } return d + waitFunc() } + +type nilLimits struct{} + +func (n nilLimits) MaxNumberOfAggregationGroups() int { return 0 } From 
6cefb7cbc7e81365adef58ad92eb0012af0c09a5 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 14 Sep 2021 12:55:01 +0100 Subject: [PATCH 62/85] Alerting: Refactor & fix unified alerting metrics structure (#39151) * Alerting: Refactor & fix unified alerting metrics structure Fixes and refactors the metrics structure we have for the ngalert service. Now, each component has its own metric struct that includes the JUST the metrics it uses. Additionally, I have fixed the configuration metrics and added new metrics to determine if we have discovered and started all the necessary configurations of an instance. This allows us to alert on `grafana_alerting_discovered_configurations - grafana_alerting_active_configurations != 0` to know whether an alertmanager instance did not start successfully. --- pkg/services/ngalert/notifier/alertmanager.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 891a2bb9..cde9ec5f 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -84,7 +84,7 @@ type Alertmanager struct { Settings *setting.Cfg Store store.AlertingStore fileStore *FileStore - Metrics *metrics.Metrics + Metrics *metrics.Alertmanager notificationLog *nflog.Log marker types.Marker @@ -111,7 +111,7 @@ type Alertmanager struct { orgID int64 } -func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Metrics) (*Alertmanager, error) { +func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, stopc: make(chan struct{}), @@ -232,7 +232,6 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error { if err != nil { return err } - am.Metrics.ActiveConfigurations.Set(1) return nil } @@ -263,7 +262,6 @@ func (am 
*Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er if err != nil { return err } - am.Metrics.ActiveConfigurations.Set(1) return nil } @@ -306,12 +304,6 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { return fmt.Errorf("unable to reload configuration: %w", err) } - if q.Result.Default { - am.Metrics.ActiveConfigurations.Set(0) - } else { - am.Metrics.ActiveConfigurations.Set(1) - } - return nil } From 9be248674804f1a7680c8a2c6dc6fac79227cfb0 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 16 Sep 2021 15:33:51 +0100 Subject: [PATCH 63/85] Alerting: Support Unified Alerting with Grafana HA (#37920) * Alerting: Support Unified Alerting in Grafana's HA mode. --- pkg/services/ngalert/notifier/alertmanager.go | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index cde9ec5f..d7fec832 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -15,6 +15,7 @@ import ( gokit_log "github.com/go-kit/kit/log" amv2 "github.com/prometheus/alertmanager/api/v2/models" + "github.com/prometheus/alertmanager/cluster" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/inhibit" "github.com/prometheus/alertmanager/nflog" @@ -24,6 +25,7 @@ import ( "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/grafana/grafana/pkg/components/securejsondata" @@ -77,9 +79,16 @@ const ( ` ) +type ClusterPeer interface { + AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel + Position() int + WaitReady(context.Context) error +} + type Alertmanager struct { logger log.Logger gokitLogger gokit_log.Logger + OrgID int64 Settings *setting.Cfg Store 
store.AlertingStore @@ -90,6 +99,8 @@ type Alertmanager struct { marker types.Marker alerts *mem.Alerts route *dispatch.Route + peer ClusterPeer + peerTimeout time.Duration dispatcher *dispatch.Dispatcher inhibitor *inhibit.Inhibitor @@ -111,7 +122,7 @@ type Alertmanager struct { orgID int64 } -func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) { +func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, stopc: make(chan struct{}), @@ -120,6 +131,8 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k stageMetrics: notify.NewMetrics(m.Registerer), dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), Store: store, + peer: peer, + peerTimeout: cfg.HAPeerTimeout, Metrics: m, orgID: orgID, } @@ -148,6 +161,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k if err != nil { return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) } + c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.OrgID), am.notificationLog, m.Registerer) + am.notificationLog.SetBroadcast(c.Broadcast) + // Initialize silences am.silences, err = silence.New(silence.Options{ Metrics: m.Registerer, @@ -158,6 +174,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } + c = am.peer.AddState(fmt.Sprintf("silences:%d", am.OrgID), am.silences, m.Registerer) + am.silences.SetBroadcast(c.Broadcast) + am.wg.Add(1) go func() { am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) { @@ -392,15 +411,16 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, 
rawConfig am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger) am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger) + meshStage := notify.NewGossipSettleStage(am.peer) inhibitionStage := notify.NewMuteStage(am.inhibitor) silencingStage := notify.NewMuteStage(am.silencer) for name := range integrationsMap { - stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) - routingStage[name] = notify.MultiStage{silencingStage, inhibitionStage, stage} + stage := am.createReceiverStage(name, integrationsMap[name], am.waitFunc, am.notificationLog) + routingStage[name] = notify.MultiStage{meshStage, silencingStage, inhibitionStage, stage} } am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) + am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) am.wg.Add(1) go func() { @@ -701,21 +721,17 @@ func (am *Alertmanager) createReceiverStage(name string, integrations []notify.I return fs } -func waitFunc() time.Duration { - // When it's a single instance, we don't need additional wait. The routing policies will have their own group wait. - // We need >0 wait here in case we have peers to sync the notification state with. 0 wait in that case can result - // in duplicate notifications being sent. - // TODO: we have setting.AlertingNotificationTimeout in legacy settings. Either use that or separate set of config - // for clustering with intuitive name, like "PeerTimeout". - return 0 +func (am *Alertmanager) waitFunc() time.Duration { + return time.Duration(am.peer.Position()) * am.peerTimeout } -func timeoutFunc(d time.Duration) time.Duration { - //TODO: What does MinTimeout means here? 
+func (am *Alertmanager) timeoutFunc(d time.Duration) time.Duration { + // time.Duration d relates to the receiver's group_interval. Even with a group interval of 1s, + // we need to make sure (non-position-0) peers in the cluster wait before flushing the notifications. if d < notify.MinTimeout { d = notify.MinTimeout } - return d + waitFunc() + return d + am.waitFunc() } type nilLimits struct{} From 354cf5e5d9cae16d7a365a24770b39ae4e46a44c Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Fri, 17 Sep 2021 14:12:27 -0400 Subject: [PATCH 64/85] Provide reader to alertmanager silence instead of file path (#39305) --- pkg/services/ngalert/notifier/alertmanager.go | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index d7fec832..f393a01b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "net/url" + "os" "path/filepath" "strconv" "sync" @@ -164,12 +165,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.OrgID), am.notificationLog, m.Registerer) am.notificationLog.SetBroadcast(c.Broadcast) - // Initialize silences - am.silences, err = silence.New(silence.Options{ - Metrics: m.Registerer, - SnapshotFile: silencesFilePath, - Retention: retentionNotificationsAndSilences, - }) + am.silences, err = newSilences(silencesFilePath, m.Registerer) if err != nil { return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } @@ -194,6 +190,31 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k return am, nil } +// newSilences initializes returns *silence.Silences (from the Alertmanager) with silences taken from the file by path silencesFilePath and specific metrics registerer. 
+func newSilences(silencesFilePath string, registerer prometheus.Registerer) (*silence.Silences, error) { + //TODO yuriy: Replace with silencesFilePath when fix in https://github.com/prometheus/alertmanager/pull/2710 is merged. + silenceOpts := silence.Options{ + Metrics: registerer, + Retention: retentionNotificationsAndSilences, + } + + //The path is generated by the filestore. So presumably it should be safe + //nolint:gosec + silencesFileReader, err := os.Open(silencesFilePath) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + + if silencesFileReader != nil { + silenceOpts.SnapshotReader = silencesFileReader + defer func(file *os.File) { + _ = file.Close() + }(silencesFileReader) + } + // Initialize silences + return silence.New(silenceOpts) +} + func (am *Alertmanager) Ready() bool { // We consider AM as ready only when the config has been // applied at least once successfully. Until then, some objects From b38ebc020da5a49cf2290eb9b946c45b64c88ae5 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 20 Sep 2021 08:12:21 +0100 Subject: [PATCH 65/85] Alerting: Move the unified alerting settings to its own struct (#39350) --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f393a01b..c135dd43 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -133,7 +133,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), Store: store, peer: peer, - peerTimeout: cfg.HAPeerTimeout, + peerTimeout: cfg.UnifiedAlerting.HAPeerTimeout, Metrics: m, orgID: orgID, } From be9bb0c25f52ef2a677575b92123bfe3c7bdd551 Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Tue, 21 Sep 2021 11:01:23 -0400 Subject: [PATCH 66/85] Alerting: Optimization of fetching 
data in multiorg alertmanager (#39237) * Add method GetAllLatestAlertmanagerConfiguration to DBStore * add method ApplyConfig to AlertManager * update multiorg alert manager to load all alertmanager configs at once --- pkg/services/ngalert/notifier/alertmanager.go | 44 +++++-------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index c135dd43..cbd42c50 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -306,44 +306,20 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er return nil } -// SyncAndApplyConfigFromDatabase picks the latest config from database and restarts -// the components with the new config. -func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error { - am.reloadConfigMtx.Lock() - defer am.reloadConfigMtx.Unlock() - - // First, let's get the configuration we need from the database. - q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{OrgID: am.orgID} - if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil { - // If there's no configuration in the database, let's use the default configuration. - if errors.Is(err, store.ErrNoAlertmanagerConfiguration) { - // First, let's save it to the database. We don't need to use a transaction here as we'll always succeed. 
- am.logger.Info("no Alertmanager configuration found, saving and applying a default") - savecmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ - AlertmanagerConfiguration: alertmanagerDefaultConfiguration, - Default: true, - ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), - OrgID: am.orgID, - } - if err := am.Store.SaveAlertmanagerConfiguration(savecmd); err != nil { - return err - } - - q.Result = &ngmodels.AlertConfiguration{AlertmanagerConfiguration: alertmanagerDefaultConfiguration, Default: true} - } else { - return fmt.Errorf("unable to get Alertmanager configuration from the database: %w", err) - } - } - - cfg, err := Load([]byte(q.Result.AlertmanagerConfiguration)) +// ApplyConfig applies the configuration to the Alertmanager. +func (am *Alertmanager) ApplyConfig(dbCfg *ngmodels.AlertConfiguration) error { + var err error + cfg, err := Load([]byte(dbCfg.AlertmanagerConfiguration)) if err != nil { - return err + return fmt.Errorf("failed to parse Alertmanager config: %w", err) } - if err := am.applyConfig(cfg, nil); err != nil { - return fmt.Errorf("unable to reload configuration: %w", err) - } + am.reloadConfigMtx.Lock() + defer am.reloadConfigMtx.Unlock() + if err = am.applyConfig(cfg, nil); err != nil { + return fmt.Errorf("unable to apply configuration: %w", err) + } return nil } From 2053ab68f363b3f2fcef19e00d5132db4200c14e Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Thu, 23 Sep 2021 13:52:20 -0400 Subject: [PATCH 67/85] Alerting: Move alertmanager default config to UnifiedAlertingSettings (#39597) --- pkg/services/ngalert/notifier/alertmanager.go | 29 ++----------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index cbd42c50..37794f5c 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -55,29 +55,6 @@ const ( defaultResolveTimeout = 5 * 
time.Minute // memoryAlertsGCInterval is the interval at which we'll remove resolved alerts from memory. memoryAlertsGCInterval = 30 * time.Minute - // To start, the alertmanager needs at least one route defined. - // TODO: we should move this to Grafana settings and define this as the default. - alertmanagerDefaultConfiguration = ` -{ - "alertmanager_config": { - "route": { - "receiver": "grafana-default-email" - }, - "receivers": [{ - "name": "grafana-default-email", - "grafana_managed_receiver_configs": [{ - "uid": "", - "name": "email receiver", - "type": "email", - "isDefault": true, - "settings": { - "addresses": "" - } - }] - }] - } -} -` ) type ClusterPeer interface { @@ -252,19 +229,19 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error { defer am.reloadConfigMtx.Unlock() cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ - AlertmanagerConfiguration: alertmanagerDefaultConfiguration, + AlertmanagerConfiguration: am.Settings.UnifiedAlerting.DefaultConfiguration, Default: true, ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), OrgID: am.orgID, } - cfg, err := Load([]byte(alertmanagerDefaultConfiguration)) + cfg, err := Load([]byte(am.Settings.UnifiedAlerting.DefaultConfiguration)) if err != nil { return err } err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { - if err := am.applyConfig(cfg, []byte(alertmanagerDefaultConfiguration)); err != nil { + if err := am.applyConfig(cfg, []byte(am.Settings.UnifiedAlerting.DefaultConfiguration)); err != nil { return err } return nil From d7c4a2899f39da4a00320e4aee1cb0aaa4091e5b Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Fri, 1 Oct 2021 09:54:37 -0400 Subject: [PATCH 68/85] Alerting: Remove extra field orgId from notifier.Alertmanager (#39870) --- pkg/services/ngalert/notifier/alertmanager.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go 
b/pkg/services/ngalert/notifier/alertmanager.go index 37794f5c..e7f5b739 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -66,7 +66,6 @@ type ClusterPeer interface { type Alertmanager struct { logger log.Logger gokitLogger gokit_log.Logger - OrgID int64 Settings *setting.Cfg Store store.AlertingStore @@ -139,7 +138,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k if err != nil { return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) } - c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.OrgID), am.notificationLog, m.Registerer) + c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.orgID), am.notificationLog, m.Registerer) am.notificationLog.SetBroadcast(c.Broadcast) am.silences, err = newSilences(silencesFilePath, m.Registerer) @@ -147,7 +146,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } - c = am.peer.AddState(fmt.Sprintf("silences:%d", am.OrgID), am.silences, m.Registerer) + c = am.peer.AddState(fmt.Sprintf("silences:%d", am.orgID), am.silences, m.Registerer) am.silences.SetBroadcast(c.Broadcast) am.wg.Add(1) From 5320a7c28ec9fe8ed4666f16ccddaa40128d20eb Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 4 Oct 2021 14:06:40 +0100 Subject: [PATCH 69/85] Alerting: Allow more characters in label names so notifications are sent (#38629) Remove validation for labels to be accepted in the Alertmanager, This helps with datasources that produce non-compatible labels. Adds an "object_matchers" to alert manager routers so we can support labels names with extended characters beyond prometheus/openmetrics. It only does this for the internal Grafana managed Alert Manager. 
This requires a change to alert manager, so for now we use grafana/alertmanager which is a slight fork, with the intention of going back to upstream. The frontend handles the migration of "matchers" -> "object_matchers" when the route is edited and saved. Once this is done, downgrades will not work old versions will not recognize the "object_matchers". Co-authored-by: Kyle Brandt Co-authored-by: Nathan Rodman --- pkg/services/ngalert/notifier/alertmanager.go | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index e7f5b739..f11b8550 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -10,9 +10,11 @@ import ( "net/url" "os" "path/filepath" + "regexp" "strconv" "sync" "time" + "unicode/utf8" gokit_log "github.com/go-kit/kit/log" amv2 "github.com/prometheus/alertmanager/api/v2/models" @@ -39,6 +41,7 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" "github.com/grafana/grafana/pkg/setting" + pb "github.com/prometheus/alertmanager/silence/silencepb" ) const ( @@ -57,6 +60,24 @@ const ( memoryAlertsGCInterval = 30 * time.Minute ) +func init() { + silence.ValidateMatcher = func(m *pb.Matcher) error { + switch m.Type { + case pb.Matcher_EQUAL, pb.Matcher_NOT_EQUAL: + if !model.LabelValue(m.Pattern).IsValid() { + return fmt.Errorf("invalid label value %q", m.Pattern) + } + case pb.Matcher_REGEXP, pb.Matcher_NOT_REGEXP: + if _, err := regexp.Compile(m.Pattern); err != nil { + return fmt.Errorf("invalid regular expression %q: %s", m.Pattern, err) + } + default: + return fmt.Errorf("unknown matcher type %q", m.Type) + } + return nil + } +} + type ClusterPeer interface { AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel Position() int @@ -392,7 +413,7 @@ func (am 
*Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig routingStage[name] = notify.MultiStage{meshStage, silencingStage, inhibitionStage, stage} } - am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) + am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route.AsAMRoute(), nil) am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) am.wg.Add(1) @@ -638,22 +659,14 @@ func validateLabelSet(ls model.LabelSet) error { return nil } -// isValidLabelName is ln.IsValid() while additionally allowing spaces. -// The regex for Prometheus data model is ^[a-zA-Z_][a-zA-Z0-9_]*$ -// while we will follow ^[a-zA-Z_][a-zA-Z0-9_ ]*$ +// isValidLabelName is ln.IsValid() without restrictions other than it can not be empty. +// The regex for Prometheus data model is ^[a-zA-Z_][a-zA-Z0-9_]*$. func isValidLabelName(ln model.LabelName) bool { if len(ln) == 0 { return false } - for i, b := range ln { - if !((b >= 'a' && b <= 'z') || - (b >= 'A' && b <= 'Z') || - b == '_' || - (i > 0 && (b == ' ' || (b >= '0' && b <= '9')))) { - return false - } - } - return true + + return utf8.ValidString(string(ln)) } // AlertValidationError is the error capturing the validation errors From a19173861f7550c33e8508f022a4d1d180b0bfec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20L=C3=B3pez=20de=20la=20Franca=20Beltran?= <5459617+joanlopez@users.noreply.github.com> Date: Thu, 7 Oct 2021 16:33:50 +0200 Subject: [PATCH 70/85] Encryption: Refactor securejsondata.SecureJsonData to stop relying on global functions (#38865) * Encryption: Add support to encrypt/decrypt sjd * Add datasources.Service as a proxy to datasources db operations * Encrypt ds.SecureJsonData before calling SQLStore * Move ds cache code into ds service * Fix tlsmanager tests * Fix pluginproxy tests * Remove some securejsondata.GetEncryptedJsonData usages * Add pluginsettings.Service as a proxy for plugin settings db 
operations * Add AlertNotificationService as a proxy for alert notification db operations * Remove some securejsondata.GetEncryptedJsonData usages * Remove more securejsondata.GetEncryptedJsonData usages * Fix lint errors * Minor fixes * Remove encryption global functions usages from ngalert * Fix lint errors * Minor fixes * Minor fixes * Remove securejsondata.DecryptedValue usage * Refactor the refactor * Remove securejsondata.DecryptedValue usage * Move securejsondata to migrations package * Move securejsondata to migrations package * Minor fix * Fix integration test * Fix integration tests * Undo undesired changes * Fix tests * Add context.Context into encryption methods * Fix tests * Fix tests * Fix tests * Trigger CI * Fix test * Add names to params of encryption service interface * Remove bus from CacheServiceImpl * Add logging * Add keys to logger Co-authored-by: Emil Tullstedt * Add missing key to logger Co-authored-by: Emil Tullstedt * Undo changes in markdown files * Fix formatting * Add context to secrets service * Rename decryptSecureJsonData to decryptSecureJsonDataFn * Name args in GetDecryptedValueFn * Add template back to NewAlertmanagerNotifier * Copy GetDecryptedValueFn to ngalert * Add logging to pluginsettings * Fix pluginsettings test Co-authored-by: Tania B Co-authored-by: Emil Tullstedt --- pkg/services/ngalert/notifier/alertmanager.go | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f11b8550..c223849f 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -31,7 +31,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" - "github.com/grafana/grafana/pkg/components/securejsondata" "github.com/grafana/grafana/pkg/infra/kvstore" "github.com/grafana/grafana/pkg/infra/log" apimodels 
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" @@ -118,9 +117,12 @@ type Alertmanager struct { config *apimodels.PostableUserConfig configHash [16]byte orgID int64 + + decryptFn channels.GetDecryptedValueFn } -func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, m *metrics.Alertmanager) (*Alertmanager, error) { +func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, + peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, stopc: make(chan struct{}), @@ -133,6 +135,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k peerTimeout: cfg.UnifiedAlerting.HAPeerTimeout, Metrics: m, orgID: orgID, + decryptFn: decryptFn, } am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)) @@ -472,7 +475,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaReceiver, tmpl *template.Template) (NotificationChannel, error) { // secure settings are already encrypted at this point - secureSettings := securejsondata.SecureJsonData(make(map[string][]byte, len(r.SecureSettings))) + secureSettings := make(map[string][]byte, len(r.SecureSettings)) for k, v := range r.SecureSettings { d, err := base64.StdEncoding.DecodeString(v) @@ -501,13 +504,13 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec case "email": n, err = channels.NewEmailNotifier(cfg, tmpl) // Email notifier already has a default template. 
case "pagerduty": - n, err = channels.NewPagerdutyNotifier(cfg, tmpl) + n, err = channels.NewPagerdutyNotifier(cfg, tmpl, am.decryptFn) case "pushover": - n, err = channels.NewPushoverNotifier(cfg, tmpl) + n, err = channels.NewPushoverNotifier(cfg, tmpl, am.decryptFn) case "slack": - n, err = channels.NewSlackNotifier(cfg, tmpl) + n, err = channels.NewSlackNotifier(cfg, tmpl, am.decryptFn) case "telegram": - n, err = channels.NewTelegramNotifier(cfg, tmpl) + n, err = channels.NewTelegramNotifier(cfg, tmpl, am.decryptFn) case "victorops": n, err = channels.NewVictoropsNotifier(cfg, tmpl) case "teams": @@ -517,21 +520,21 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec case "kafka": n, err = channels.NewKafkaNotifier(cfg, tmpl) case "webhook": - n, err = channels.NewWebHookNotifier(cfg, tmpl) + n, err = channels.NewWebHookNotifier(cfg, tmpl, am.decryptFn) case "sensugo": - n, err = channels.NewSensuGoNotifier(cfg, tmpl) + n, err = channels.NewSensuGoNotifier(cfg, tmpl, am.decryptFn) case "discord": n, err = channels.NewDiscordNotifier(cfg, tmpl) case "googlechat": n, err = channels.NewGoogleChatNotifier(cfg, tmpl) case "LINE": - n, err = channels.NewLineNotifier(cfg, tmpl) + n, err = channels.NewLineNotifier(cfg, tmpl, am.decryptFn) case "threema": - n, err = channels.NewThreemaNotifier(cfg, tmpl) + n, err = channels.NewThreemaNotifier(cfg, tmpl, am.decryptFn) case "opsgenie": - n, err = channels.NewOpsgenieNotifier(cfg, tmpl) + n, err = channels.NewOpsgenieNotifier(cfg, tmpl, am.decryptFn) case "prometheus-alertmanager": - n, err = channels.NewAlertmanagerNotifier(cfg, tmpl) + n, err = channels.NewAlertmanagerNotifier(cfg, tmpl, am.decryptFn) default: return nil, InvalidReceiverError{ Receiver: r, From c639e235105a7feae03330bda4cba6b39d34d011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Philippe=20Qu=C3=A9m=C3=A9ner?= Date: Fri, 8 Oct 2021 14:52:44 +0200 Subject: [PATCH 71/85] Alerting: add organziation ID to the ngAlert webhook 
payload (#40189) * Alerting: add organziation ID to the ngAlert webhook payload --- pkg/services/ngalert/notifier/alertmanager.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index c223849f..9a827da6 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -491,6 +491,7 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec var ( cfg = &channels.NotificationChannelConfig{ UID: r.UID, + OrgID: am.orgID, Name: r.Name, Type: r.Type, DisableResolveMessage: r.DisableResolveMessage, From 3808d5d5f39e0224bee79c901fb74b0b05f7656e Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Fri, 29 Oct 2021 10:03:51 -0400 Subject: [PATCH 72/85] Alerting: Update references to alertmanager (#40904) * update module reference for alertmanager * remove workaround --- pkg/services/ngalert/notifier/alertmanager.go | 36 +++++-------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 9a827da6..0eee6ef6 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -8,7 +8,6 @@ import ( "errors" "fmt" "net/url" - "os" "path/filepath" "regexp" "strconv" @@ -31,6 +30,8 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + pb "github.com/prometheus/alertmanager/silence/silencepb" + "github.com/grafana/grafana/pkg/infra/kvstore" "github.com/grafana/grafana/pkg/infra/log" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" @@ -40,7 +41,6 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" "github.com/grafana/grafana/pkg/setting" - pb "github.com/prometheus/alertmanager/silence/silencepb" ) 
const ( @@ -165,7 +165,12 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.orgID), am.notificationLog, m.Registerer) am.notificationLog.SetBroadcast(c.Broadcast) - am.silences, err = newSilences(silencesFilePath, m.Registerer) + // Initialize silences + am.silences, err = silence.New(silence.Options{ + Metrics: m.Registerer, + SnapshotFile: silencesFilePath, + Retention: retentionNotificationsAndSilences, + }) if err != nil { return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } @@ -190,31 +195,6 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k return am, nil } -// newSilences initializes returns *silence.Silences (from the Alertmanager) with silences taken from the file by path silencesFilePath and specific metrics registerer. -func newSilences(silencesFilePath string, registerer prometheus.Registerer) (*silence.Silences, error) { - //TODO yuriy: Replace with silencesFilePath when fix in https://github.com/prometheus/alertmanager/pull/2710 is merged. - silenceOpts := silence.Options{ - Metrics: registerer, - Retention: retentionNotificationsAndSilences, - } - - //The path is generated by the filestore. So presumably it should be safe - //nolint:gosec - silencesFileReader, err := os.Open(silencesFilePath) - if err != nil && !os.IsNotExist(err) { - return nil, err - } - - if silencesFileReader != nil { - silenceOpts.SnapshotReader = silencesFileReader - defer func(file *os.File) { - _ = file.Close() - }(silencesFileReader) - } - // Initialize silences - return silence.New(silenceOpts) -} - func (am *Alertmanager) Ready() bool { // We consider AM as ready only when the config has been // applied at least once successfully. 
Until then, some objects From 265a1d1ddd2b78f6a83f0f23ea6197d671bc913e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Philippe=20Qu=C3=A9m=C3=A9ner?= Date: Fri, 19 Nov 2021 16:50:55 +0100 Subject: [PATCH 73/85] Alerting: support mute timings configuration through the api for the embedded alertmanager (#41533) * Alerting: accept mute_timing_intervals through the api for the embedded alertmanager * add workaround for mutetimeinterval * add mute timings to routes * revert changes * Update pkg/services/ngalert/api/api_alertmanager.go * Update pkg/services/ngalert/api/api_alertmanager.go * Update pkg/services/ngalert/api/api_alertmanager.go * update prometheus/alertmanager dependency * add some var docs --- pkg/services/ngalert/notifier/alertmanager.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 0eee6ef6..027f976f 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -18,6 +18,7 @@ import ( gokit_log "github.com/go-kit/kit/log" amv2 "github.com/prometheus/alertmanager/api/v2/models" "github.com/prometheus/alertmanager/cluster" + "github.com/prometheus/alertmanager/config" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/inhibit" "github.com/prometheus/alertmanager/nflog" @@ -26,6 +27,7 @@ import ( "github.com/prometheus/alertmanager/provider/mem" "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/template" + "github.com/prometheus/alertmanager/timeinterval" "github.com/prometheus/alertmanager/types" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" @@ -110,6 +112,10 @@ type Alertmanager struct { silencer *silence.Silencer silences *silence.Silences + // muteTimes is a map where the key is the name of the mute_time_interval + // and the value represents all configured 
time_interval(s) + muteTimes map[string][]timeinterval.TimeInterval + stageMetrics *notify.Metrics dispatcherMetrics *dispatch.DispatcherMetrics @@ -329,6 +335,14 @@ func (am *Alertmanager) templateFromPaths(paths ...string) (*template.Template, return tmpl, nil } +func (am *Alertmanager) buildMuteTimesMap(muteTimeIntervals []config.MuteTimeInterval) map[string][]timeinterval.TimeInterval { + muteTimes := make(map[string][]timeinterval.TimeInterval, len(muteTimeIntervals)) + for _, ti := range muteTimeIntervals { + muteTimes[ti.Name] = ti.TimeIntervals + } + return muteTimes +} + // applyConfig applies a new configuration by re-initializing all components using the configuration provided. // It is not safe to call concurrently. func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) (err error) { @@ -375,6 +389,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig if err != nil { return fmt.Errorf("failed to build integration map: %w", err) } + // Now, let's put together our notification pipeline routingStage := make(notify.RoutingStage, len(integrationsMap)) @@ -386,14 +401,16 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger) + am.muteTimes = am.buildMuteTimesMap(cfg.AlertmanagerConfig.MuteTimeIntervals) am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger) meshStage := notify.NewGossipSettleStage(am.peer) inhibitionStage := notify.NewMuteStage(am.inhibitor) + timeMuteStage := notify.NewTimeMuteStage(am.muteTimes) silencingStage := notify.NewMuteStage(am.silencer) for name := range integrationsMap { stage := am.createReceiverStage(name, integrationsMap[name], am.waitFunc, am.notificationLog) - routingStage[name] = notify.MultiStage{meshStage, silencingStage, inhibitionStage, stage} + routingStage[name] = notify.MultiStage{meshStage, 
silencingStage, timeMuteStage, inhibitionStage, stage} } am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route.AsAMRoute(), nil) From 4dd9b0c801f97c7bd2bf78c2ce26fe59797e3028 Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Mon, 13 Dec 2021 09:41:53 -0500 Subject: [PATCH 74/85] Improve bridge for Alertmanager logger (#42958) * Implement go-kit/log.Logger for internal logger. --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 027f976f..0194a779 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -144,7 +144,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k decryptFn: decryptFn, } - am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)) + am.gokitLogger = logging.NewWrapper(am.logger) am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath()) nflogFilepath, err := am.fileStore.FilepathFor(context.TODO(), notificationLogFilename) From 5f4cf823f3a1e5fa13d932d93864909e7cffe489 Mon Sep 17 00:00:00 2001 From: smallpath Date: Thu, 16 Dec 2021 00:42:03 +0800 Subject: [PATCH 75/85] Alerting: Support WeCom as a contact point type (#40975) * add wecom notifier * fix backend lint * fix alerting channel test * update wecom doc * update notifiers * update wecom notifier test * Apply suggestions from code review Co-authored-by: gotjosh * unify wecom alerting * fix backend lint * fix front lint * fix wecom test * update docs * Update pkg/services/ngalert/notifier/channels/wecom.go Co-authored-by: gotjosh * Update docs/sources/alerting/old-alerting/notifications.md Co-authored-by: gotjosh * Update docs/sources/alerting/old-alerting/notifications.md Co-authored-by: gotjosh * Update docs/sources/alerting/old-alerting/notifications.md Co-authored-by: gotjosh * remove old wecom notifier * 
remove old notifier doc * fix backend test * Update docs/sources/alerting/unified-alerting/contact-points.md Co-authored-by: gotjosh * fix doc style Co-authored-by: gotjosh --- pkg/services/ngalert/notifier/alertmanager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 0194a779..74b79ff7 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -519,6 +519,8 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec n, err = channels.NewKafkaNotifier(cfg, tmpl) case "webhook": n, err = channels.NewWebHookNotifier(cfg, tmpl, am.decryptFn) + case "wecom": + n, err = channels.NewWeComNotifier(cfg, tmpl, am.decryptFn) case "sensugo": n, err = channels.NewSensuGoNotifier(cfg, tmpl, am.decryptFn) case "discord": From 193c801b2ec784c0c041c274725ddda17a363dd8 Mon Sep 17 00:00:00 2001 From: idafurjes <36131195+idafurjes@users.noreply.github.com> Date: Tue, 28 Dec 2021 10:26:18 +0100 Subject: [PATCH 76/85] Chore: Remove context.TODO (#43458) * Remove context.TODO() from services * Fix live test --- pkg/services/ngalert/notifier/alertmanager.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 74b79ff7..5928d5bf 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -127,7 +127,7 @@ type Alertmanager struct { decryptFn channels.GetDecryptedValueFn } -func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, +func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ 
Settings: cfg, @@ -147,11 +147,11 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k am.gokitLogger = logging.NewWrapper(am.logger) am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath()) - nflogFilepath, err := am.fileStore.FilepathFor(context.TODO(), notificationLogFilename) + nflogFilepath, err := am.fileStore.FilepathFor(ctx, notificationLogFilename) if err != nil { return nil, err } - silencesFilePath, err := am.fileStore.FilepathFor(context.TODO(), silencesFilename) + silencesFilePath, err := am.fileStore.FilepathFor(ctx, silencesFilename) if err != nil { return nil, err } @@ -162,7 +162,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k nflog.WithRetention(retentionNotificationsAndSilences), nflog.WithSnapshot(nflogFilepath), nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done, func() (int64, error) { - return am.fileStore.Persist(context.TODO(), notificationLogFilename, am.notificationLog) + return am.fileStore.Persist(ctx, notificationLogFilename, am.notificationLog) }), ) if err != nil { @@ -187,7 +187,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k am.wg.Add(1) go func() { am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) { - return am.fileStore.Persist(context.TODO(), silencesFilename, am.silences) + return am.fileStore.Persist(ctx, silencesFilename, am.silences) }) am.wg.Done() }() From 029028caa2fa0ea9e8edbf5d5ff18af135844607 Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Fri, 7 Jan 2022 03:40:09 -0500 Subject: [PATCH 77/85] Alerting: Remove bridge between log15 and go-kit logger (#43769) * remove bridge between log15 and go-kit logger. 
* fix tests --- pkg/services/ngalert/notifier/alertmanager.go | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 5928d5bf..c7311ca3 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -15,7 +15,6 @@ import ( "time" "unicode/utf8" - gokit_log "github.com/go-kit/kit/log" amv2 "github.com/prometheus/alertmanager/api/v2/models" "github.com/prometheus/alertmanager/cluster" "github.com/prometheus/alertmanager/config" @@ -37,7 +36,6 @@ import ( "github.com/grafana/grafana/pkg/infra/kvstore" "github.com/grafana/grafana/pkg/infra/log" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" - "github.com/grafana/grafana/pkg/services/ngalert/logging" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" @@ -86,8 +84,7 @@ type ClusterPeer interface { } type Alertmanager struct { - logger log.Logger - gokitLogger gokit_log.Logger + logger log.Logger Settings *setting.Cfg Store store.AlertingStore @@ -144,7 +141,6 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store s decryptFn: decryptFn, } - am.gokitLogger = logging.NewWrapper(am.logger) am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath()) nflogFilepath, err := am.fileStore.FilepathFor(ctx, notificationLogFilename) @@ -193,7 +189,7 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store s }() // Initialize in-memory alerts - am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, nil, am.gokitLogger) + am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, nil, am.logger) if err != nil { return nil, fmt.Errorf("unable to initialize the alert 
provider component of alerting: %w", err) } @@ -400,9 +396,9 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig am.dispatcher.Stop() } - am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger) + am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.logger) am.muteTimes = am.buildMuteTimesMap(cfg.AlertmanagerConfig.MuteTimeIntervals) - am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger) + am.silencer = silence.NewSilencer(am.silences, am.marker, am.logger) meshStage := notify.NewGossipSettleStage(am.peer) inhibitionStage := notify.NewMuteStage(am.inhibitor) @@ -414,7 +410,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig } am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route.AsAMRoute(), nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) + am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.logger, am.dispatcherMetrics) am.wg.Add(1) go func() { From 119398b38c213bab0fc9868c5f3f84f6dd179ea3 Mon Sep 17 00:00:00 2001 From: Serge Zaitsev Date: Wed, 26 Jan 2022 16:42:40 +0100 Subject: [PATCH 78/85] Chore: Remove bus from ngalert (#44465) * pass notification service down to the notifiers * add ns to all notifiers * remove bus from ngalert notifiers * use smaller interfaces for notificationservice * attempt to fix the tests * remove unused struct field * simplify notification service mock * trying to resolve issues in the tests * make linter happy * make linter even happier * linter, you are annoying --- pkg/services/ngalert/notifier/alertmanager.go | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go 
b/pkg/services/ngalert/notifier/alertmanager.go index c7311ca3..ec74131a 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -40,6 +40,7 @@ import ( ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" "github.com/grafana/grafana/pkg/services/ngalert/store" + "github.com/grafana/grafana/pkg/services/notifications" "github.com/grafana/grafana/pkg/setting" ) @@ -86,10 +87,11 @@ type ClusterPeer interface { type Alertmanager struct { logger log.Logger - Settings *setting.Cfg - Store store.AlertingStore - fileStore *FileStore - Metrics *metrics.Alertmanager + Settings *setting.Cfg + Store store.AlertingStore + fileStore *FileStore + Metrics *metrics.Alertmanager + NotificationService notifications.Service notificationLog *nflog.Log marker types.Marker @@ -125,20 +127,21 @@ type Alertmanager struct { } func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, - peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, m *metrics.Alertmanager) (*Alertmanager, error) { + peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, ns notifications.Service, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ - Settings: cfg, - stopc: make(chan struct{}), - logger: log.New("alertmanager", "org", orgID), - marker: types.NewMarker(m.Registerer), - stageMetrics: notify.NewMetrics(m.Registerer), - dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), - Store: store, - peer: peer, - peerTimeout: cfg.UnifiedAlerting.HAPeerTimeout, - Metrics: m, - orgID: orgID, - decryptFn: decryptFn, + Settings: cfg, + stopc: make(chan struct{}), + logger: log.New("alertmanager", "org", orgID), + marker: types.NewMarker(m.Registerer), + stageMetrics: notify.NewMetrics(m.Registerer), + dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), + 
Store: store, + peer: peer, + peerTimeout: cfg.UnifiedAlerting.HAPeerTimeout, + Metrics: m, + NotificationService: ns, + orgID: orgID, + decryptFn: decryptFn, } am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath()) @@ -496,39 +499,39 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec ) switch r.Type { case "email": - n, err = channels.NewEmailNotifier(cfg, tmpl) // Email notifier already has a default template. + n, err = channels.NewEmailNotifier(cfg, am.NotificationService, tmpl) // Email notifier already has a default template. case "pagerduty": - n, err = channels.NewPagerdutyNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewPagerdutyNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "pushover": - n, err = channels.NewPushoverNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewPushoverNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "slack": n, err = channels.NewSlackNotifier(cfg, tmpl, am.decryptFn) case "telegram": - n, err = channels.NewTelegramNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewTelegramNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "victorops": - n, err = channels.NewVictoropsNotifier(cfg, tmpl) + n, err = channels.NewVictoropsNotifier(cfg, am.NotificationService, tmpl) case "teams": - n, err = channels.NewTeamsNotifier(cfg, tmpl) + n, err = channels.NewTeamsNotifier(cfg, am.NotificationService, tmpl) case "dingding": - n, err = channels.NewDingDingNotifier(cfg, tmpl) + n, err = channels.NewDingDingNotifier(cfg, am.NotificationService, tmpl) case "kafka": - n, err = channels.NewKafkaNotifier(cfg, tmpl) + n, err = channels.NewKafkaNotifier(cfg, am.NotificationService, tmpl) case "webhook": - n, err = channels.NewWebHookNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewWebHookNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "wecom": - n, err = channels.NewWeComNotifier(cfg, tmpl, am.decryptFn) + n, err = 
channels.NewWeComNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "sensugo": - n, err = channels.NewSensuGoNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewSensuGoNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "discord": - n, err = channels.NewDiscordNotifier(cfg, tmpl) + n, err = channels.NewDiscordNotifier(cfg, am.NotificationService, tmpl) case "googlechat": - n, err = channels.NewGoogleChatNotifier(cfg, tmpl) + n, err = channels.NewGoogleChatNotifier(cfg, am.NotificationService, tmpl) case "LINE": - n, err = channels.NewLineNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewLineNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "threema": - n, err = channels.NewThreemaNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewThreemaNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "opsgenie": - n, err = channels.NewOpsgenieNotifier(cfg, tmpl, am.decryptFn) + n, err = channels.NewOpsgenieNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) case "prometheus-alertmanager": n, err = channels.NewAlertmanagerNotifier(cfg, tmpl, am.decryptFn) default: From acf37a5de183b61c0159c55d4fc19ea3c5d4b582 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Wed, 9 Feb 2022 09:22:09 +0000 Subject: [PATCH 79/85] Add context.Context to AlertingStore (#45069) --- pkg/services/ngalert/notifier/alertmanager.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index ec74131a..2ebe423b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -232,7 +232,7 @@ func (am *Alertmanager) StopAndWait() { // SaveAndApplyDefaultConfig saves the default configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. 
-func (am *Alertmanager) SaveAndApplyDefaultConfig() error { +func (am *Alertmanager) SaveAndApplyDefaultConfig(ctx context.Context) error { am.reloadConfigMtx.Lock() defer am.reloadConfigMtx.Unlock() @@ -248,7 +248,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error { return err } - err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { + err = am.Store.SaveAlertmanagerConfigurationWithCallback(ctx, cmd, func() error { if err := am.applyConfig(cfg, []byte(am.Settings.UnifiedAlerting.DefaultConfiguration)); err != nil { return err } @@ -263,7 +263,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error { // SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager. // It rollbacks the save if we fail to apply the configuration. -func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error { +func (am *Alertmanager) SaveAndApplyConfig(ctx context.Context, cfg *apimodels.PostableUserConfig) error { rawConfig, err := json.Marshal(&cfg) if err != nil { return fmt.Errorf("failed to serialize to the Alertmanager configuration: %w", err) @@ -278,7 +278,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er OrgID: am.orgID, } - err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error { + err = am.Store.SaveAlertmanagerConfigurationWithCallback(ctx, cmd, func() error { if err := am.applyConfig(cfg, rawConfig); err != nil { return err } From 41a20d993b00d7662f72f952bb1c8b9d41fdeb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Philippe=20Qu=C3=A9m=C3=A9ner?= Date: Tue, 15 Mar 2022 00:27:10 +0100 Subject: [PATCH 80/85] Alerting: refactor receiver validation to be reusable (#46103) --- pkg/services/ngalert/notifier/alertmanager.go | 59 ++++--------------- 1 file changed, 11 insertions(+), 48 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 
2ebe423b..2a5c1374 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -451,11 +451,6 @@ func (am *Alertmanager) buildIntegrationsMap(receivers []*apimodels.PostableApiR return integrationsMap, nil } -type NotificationChannel interface { - notify.Notifier - notify.ResolvedSender -} - // buildReceiverIntegrations builds a list of integration notifiers off of a receiver config. func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) { var integrations []notify.Integration @@ -469,7 +464,7 @@ func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableAp return integrations, nil } -func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaReceiver, tmpl *template.Template) (NotificationChannel, error) { +func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaReceiver, tmpl *template.Template) (channels.NotificationChannel, error) { // secure settings are already encrypted at this point secureSettings := make(map[string][]byte, len(r.SecureSettings)) @@ -494,60 +489,28 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec Settings: r.Settings, SecureSettings: secureSettings, } - n NotificationChannel - err error ) - switch r.Type { - case "email": - n, err = channels.NewEmailNotifier(cfg, am.NotificationService, tmpl) // Email notifier already has a default template. 
- case "pagerduty": - n, err = channels.NewPagerdutyNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "pushover": - n, err = channels.NewPushoverNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "slack": - n, err = channels.NewSlackNotifier(cfg, tmpl, am.decryptFn) - case "telegram": - n, err = channels.NewTelegramNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "victorops": - n, err = channels.NewVictoropsNotifier(cfg, am.NotificationService, tmpl) - case "teams": - n, err = channels.NewTeamsNotifier(cfg, am.NotificationService, tmpl) - case "dingding": - n, err = channels.NewDingDingNotifier(cfg, am.NotificationService, tmpl) - case "kafka": - n, err = channels.NewKafkaNotifier(cfg, am.NotificationService, tmpl) - case "webhook": - n, err = channels.NewWebHookNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "wecom": - n, err = channels.NewWeComNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "sensugo": - n, err = channels.NewSensuGoNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "discord": - n, err = channels.NewDiscordNotifier(cfg, am.NotificationService, tmpl) - case "googlechat": - n, err = channels.NewGoogleChatNotifier(cfg, am.NotificationService, tmpl) - case "LINE": - n, err = channels.NewLineNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "threema": - n, err = channels.NewThreemaNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "opsgenie": - n, err = channels.NewOpsgenieNotifier(cfg, am.NotificationService, tmpl, am.decryptFn) - case "prometheus-alertmanager": - n, err = channels.NewAlertmanagerNotifier(cfg, tmpl, am.decryptFn) - default: + factoryConfig, err := channels.NewFactoryConfig(cfg, am.NotificationService, am.decryptFn, tmpl) + if err != nil { + return nil, InvalidReceiverError{ + Receiver: r, + Err: err, + } + } + receiverFactory, exists := channels.Factory(r.Type) + if !exists { return nil, InvalidReceiverError{ 
Receiver: r, Err: fmt.Errorf("notifier %s is not supported", r.Type), } } - + n, err := receiverFactory(factoryConfig) if err != nil { return nil, InvalidReceiverError{ Receiver: r, Err: err, } } - return n, nil } From cb22a3bd07affbfa4a8efe7e905b82dd95c795bc Mon Sep 17 00:00:00 2001 From: Joe Blubaugh Date: Wed, 23 Mar 2022 16:49:02 +0800 Subject: [PATCH 81/85] Unified Alerting, Issue 41156: Clean up expired silences. (#46740) Expired silences older than the retention period were not being cleaned up. The root problem was that notifier.Alertmanager overrides the Prometheus alert manager's silence maintenance function and was not calling Silences.GC() in the overriden function. --- pkg/services/ngalert/notifier/alertmanager.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 2a5c1374..f6aefcea 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -49,8 +49,6 @@ const ( silencesFilename = "silences" workingDir = "alerting" - // How long should we keep silences and notification entries on-disk after they've served their purpose. - retentionNotificationsAndSilences = 5 * 24 * time.Hour // maintenanceNotificationAndSilences how often should we flush and gargabe collect notifications and silences maintenanceNotificationAndSilences = 15 * time.Minute // defaultResolveTimeout is the default timeout used for resolving an alert @@ -60,6 +58,10 @@ const ( memoryAlertsGCInterval = 30 * time.Minute ) +// How long should we keep silences and notification entries on-disk after they've served their purpose. 
+var retentionNotificationsAndSilences = 5 * 24 * time.Hour +var silenceMaintenanceInterval = 15 * time.Minute + func init() { silence.ValidateMatcher = func(m *pb.Matcher) error { switch m.Type { @@ -185,7 +187,14 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store s am.wg.Add(1) go func() { - am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) { + am.silences.Maintenance(silenceMaintenanceInterval, silencesFilePath, am.stopc, func() (int64, error) { + // Delete silences older than the retention period. + if _, err := am.silences.GC(); err != nil { + am.logger.Error("Silence Garbage Collection Failed at %v: %v", time.Now(), err) + // Don't return here - we need to snapshot our state first. + } + + // Snapshot our silences to the Grafana KV store return am.fileStore.Persist(ctx, silencesFilename, am.silences) }) am.wg.Done() From f84c1e51b6af3a106c8504dc6171fde98c19c3f7 Mon Sep 17 00:00:00 2001 From: Joe Blubaugh Date: Wed, 23 Mar 2022 15:07:07 +0100 Subject: [PATCH 82/85] Unified Alerting: Make log message follow codebase convention. (#46881) 1. Keep log lines lower case. 2. The key-value pair arguments are not format argument for the string. 3. Always use the "err" key. --- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f6aefcea..a42d7252 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -190,7 +190,7 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store s am.silences.Maintenance(silenceMaintenanceInterval, silencesFilePath, am.stopc, func() (int64, error) { // Delete silences older than the retention period. 
if _, err := am.silences.GC(); err != nil { - am.logger.Error("Silence Garbage Collection Failed at %v: %v", time.Now(), err) + am.logger.Error("silence garbage collection", "err", err) // Don't return here - we need to snapshot our state first. } From 538d23f3dbd3850c9d46d9d99f8d3563cd553b25 Mon Sep 17 00:00:00 2001 From: Joe Blubaugh Date: Mon, 23 May 2022 14:24:20 +0800 Subject: [PATCH 83/85] Alerting: Attach screenshot data to Slack notifications. (#49374) This change extracts screenshot data from alert messages via a private annotation `__alertScreenshotToken__` and attaches a URL to a Slack message or uploads the data to an image upload endpoint if needed. This change also implements a few foundational functions for use in other notifiers. --- pkg/services/ngalert/notifier/alertmanager.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index a42d7252..7d1ff55c 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -86,11 +86,16 @@ type ClusterPeer interface { WaitReady(context.Context) error } +type AlertingStore interface { + store.AlertingStore + channels.ImageStore +} + type Alertmanager struct { logger log.Logger Settings *setting.Cfg - Store store.AlertingStore + Store AlertingStore fileStore *FileStore Metrics *metrics.Alertmanager NotificationService notifications.Service @@ -128,7 +133,7 @@ type Alertmanager struct { decryptFn channels.GetDecryptedValueFn } -func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, +func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, ns notifications.Service, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, @@ 
-499,7 +504,7 @@ func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaRec SecureSettings: secureSettings, } ) - factoryConfig, err := channels.NewFactoryConfig(cfg, am.NotificationService, am.decryptFn, tmpl) + factoryConfig, err := channels.NewFactoryConfig(cfg, am.NotificationService, am.decryptFn, tmpl, am.Store) if err != nil { return nil, InvalidReceiverError{ Receiver: r, From d1eb91bc96977c691e0fb48ea6cf0a4a600f684f Mon Sep 17 00:00:00 2001 From: Joe Blubaugh Date: Thu, 26 May 2022 13:29:56 +0800 Subject: [PATCH 84/85] Alerting: Add stored screenshot utilities to the channels package. (#49470) Adds three functions: `withStoredImages` iterates over a list of models.Alerts, extracting a stored image's data from storage, if available, and executing a user-provided function. `withStoredImage` does this for an image attached to a specific alert. `openImage` finds and opens an image file on disk. Moves `store.Image` to `models.Image` Simplifies `channels.ImageStore` interface and updates notifiers that use it to use the simpler methods. Updates all pkg/alert/notifier/channels to use withStoredImage routines. 
--- pkg/services/ngalert/notifier/alertmanager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index 7d1ff55c..7243a9ac 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -88,7 +88,7 @@ type ClusterPeer interface { type AlertingStore interface { store.AlertingStore - channels.ImageStore + store.ImageStore } type Alertmanager struct { From 9350b61910139d6982eeb2d5b5b53b2e07340f32 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 14 Jul 2022 12:36:19 +0100 Subject: [PATCH 85/85] Move to the alerting package --- pkg/services/ngalert/notifier/alertmanager.go | 704 ------------------ 1 file changed, 704 deletions(-) delete mode 100644 pkg/services/ngalert/notifier/alertmanager.go diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go deleted file mode 100644 index 7243a9ac..00000000 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ /dev/null @@ -1,704 +0,0 @@ -package notifier - -import ( - "context" - "crypto/md5" - "encoding/base64" - "encoding/json" - "errors" - "fmt" - "net/url" - "path/filepath" - "regexp" - "strconv" - "sync" - "time" - "unicode/utf8" - - amv2 "github.com/prometheus/alertmanager/api/v2/models" - "github.com/prometheus/alertmanager/cluster" - "github.com/prometheus/alertmanager/config" - "github.com/prometheus/alertmanager/dispatch" - "github.com/prometheus/alertmanager/inhibit" - "github.com/prometheus/alertmanager/nflog" - "github.com/prometheus/alertmanager/nflog/nflogpb" - "github.com/prometheus/alertmanager/notify" - "github.com/prometheus/alertmanager/provider/mem" - "github.com/prometheus/alertmanager/silence" - "github.com/prometheus/alertmanager/template" - "github.com/prometheus/alertmanager/timeinterval" - "github.com/prometheus/alertmanager/types" - "github.com/prometheus/client_golang/prometheus" - 
"github.com/prometheus/common/model" - - pb "github.com/prometheus/alertmanager/silence/silencepb" - - "github.com/grafana/grafana/pkg/infra/kvstore" - "github.com/grafana/grafana/pkg/infra/log" - apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" - "github.com/grafana/grafana/pkg/services/ngalert/metrics" - ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" - "github.com/grafana/grafana/pkg/services/ngalert/notifier/channels" - "github.com/grafana/grafana/pkg/services/ngalert/store" - "github.com/grafana/grafana/pkg/services/notifications" - "github.com/grafana/grafana/pkg/setting" -) - -const ( - notificationLogFilename = "notifications" - silencesFilename = "silences" - - workingDir = "alerting" - // maintenanceNotificationAndSilences how often should we flush and gargabe collect notifications and silences - maintenanceNotificationAndSilences = 15 * time.Minute - // defaultResolveTimeout is the default timeout used for resolving an alert - // if the end time is not specified. - defaultResolveTimeout = 5 * time.Minute - // memoryAlertsGCInterval is the interval at which we'll remove resolved alerts from memory. - memoryAlertsGCInterval = 30 * time.Minute -) - -// How long should we keep silences and notification entries on-disk after they've served their purpose. 
-var retentionNotificationsAndSilences = 5 * 24 * time.Hour -var silenceMaintenanceInterval = 15 * time.Minute - -func init() { - silence.ValidateMatcher = func(m *pb.Matcher) error { - switch m.Type { - case pb.Matcher_EQUAL, pb.Matcher_NOT_EQUAL: - if !model.LabelValue(m.Pattern).IsValid() { - return fmt.Errorf("invalid label value %q", m.Pattern) - } - case pb.Matcher_REGEXP, pb.Matcher_NOT_REGEXP: - if _, err := regexp.Compile(m.Pattern); err != nil { - return fmt.Errorf("invalid regular expression %q: %s", m.Pattern, err) - } - default: - return fmt.Errorf("unknown matcher type %q", m.Type) - } - return nil - } -} - -type ClusterPeer interface { - AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel - Position() int - WaitReady(context.Context) error -} - -type AlertingStore interface { - store.AlertingStore - store.ImageStore -} - -type Alertmanager struct { - logger log.Logger - - Settings *setting.Cfg - Store AlertingStore - fileStore *FileStore - Metrics *metrics.Alertmanager - NotificationService notifications.Service - - notificationLog *nflog.Log - marker types.Marker - alerts *mem.Alerts - route *dispatch.Route - peer ClusterPeer - peerTimeout time.Duration - - dispatcher *dispatch.Dispatcher - inhibitor *inhibit.Inhibitor - // wg is for dispatcher, inhibitor, silences and notifications - // Across configuration changes dispatcher and inhibitor are completely replaced, however, silences, notification log and alerts remain the same. - // stopc is used to let silences and notifications know we are done. 
- wg sync.WaitGroup - stopc chan struct{} - - silencer *silence.Silencer - silences *silence.Silences - - // muteTimes is a map where the key is the name of the mute_time_interval - // and the value represents all configured time_interval(s) - muteTimes map[string][]timeinterval.TimeInterval - - stageMetrics *notify.Metrics - dispatcherMetrics *dispatch.DispatcherMetrics - - reloadConfigMtx sync.RWMutex - config *apimodels.PostableUserConfig - configHash [16]byte - orgID int64 - - decryptFn channels.GetDecryptedValueFn -} - -func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store AlertingStore, kvStore kvstore.KVStore, - peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, ns notifications.Service, m *metrics.Alertmanager) (*Alertmanager, error) { - am := &Alertmanager{ - Settings: cfg, - stopc: make(chan struct{}), - logger: log.New("alertmanager", "org", orgID), - marker: types.NewMarker(m.Registerer), - stageMetrics: notify.NewMetrics(m.Registerer), - dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), - Store: store, - peer: peer, - peerTimeout: cfg.UnifiedAlerting.HAPeerTimeout, - Metrics: m, - NotificationService: ns, - orgID: orgID, - decryptFn: decryptFn, - } - - am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath()) - - nflogFilepath, err := am.fileStore.FilepathFor(ctx, notificationLogFilename) - if err != nil { - return nil, err - } - silencesFilePath, err := am.fileStore.FilepathFor(ctx, silencesFilename) - if err != nil { - return nil, err - } - - // Initialize the notification log - am.wg.Add(1) - am.notificationLog, err = nflog.New( - nflog.WithRetention(retentionNotificationsAndSilences), - nflog.WithSnapshot(nflogFilepath), - nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done, func() (int64, error) { - return am.fileStore.Persist(ctx, notificationLogFilename, am.notificationLog) - }), - ) - if err != nil { - return nil, fmt.Errorf("unable to initialize the 
notification log component of alerting: %w", err) - } - c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.orgID), am.notificationLog, m.Registerer) - am.notificationLog.SetBroadcast(c.Broadcast) - - // Initialize silences - am.silences, err = silence.New(silence.Options{ - Metrics: m.Registerer, - SnapshotFile: silencesFilePath, - Retention: retentionNotificationsAndSilences, - }) - if err != nil { - return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) - } - - c = am.peer.AddState(fmt.Sprintf("silences:%d", am.orgID), am.silences, m.Registerer) - am.silences.SetBroadcast(c.Broadcast) - - am.wg.Add(1) - go func() { - am.silences.Maintenance(silenceMaintenanceInterval, silencesFilePath, am.stopc, func() (int64, error) { - // Delete silences older than the retention period. - if _, err := am.silences.GC(); err != nil { - am.logger.Error("silence garbage collection", "err", err) - // Don't return here - we need to snapshot our state first. - } - - // Snapshot our silences to the Grafana KV store - return am.fileStore.Persist(ctx, silencesFilename, am.silences) - }) - am.wg.Done() - }() - - // Initialize in-memory alerts - am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, nil, am.logger) - if err != nil { - return nil, fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err) - } - - return am, nil -} - -func (am *Alertmanager) Ready() bool { - // We consider AM as ready only when the config has been - // applied at least once successfully. Until then, some objects - // can still be nil. 
- am.reloadConfigMtx.RLock() - defer am.reloadConfigMtx.RUnlock() - - return am.ready() -} - -func (am *Alertmanager) ready() bool { - return am.config != nil -} - -func (am *Alertmanager) StopAndWait() { - if am.dispatcher != nil { - am.dispatcher.Stop() - } - - if am.inhibitor != nil { - am.inhibitor.Stop() - } - - am.alerts.Close() - - close(am.stopc) - - am.wg.Wait() -} - -// SaveAndApplyDefaultConfig saves the default configuration the database and applies the configuration to the Alertmanager. -// It rollbacks the save if we fail to apply the configuration. -func (am *Alertmanager) SaveAndApplyDefaultConfig(ctx context.Context) error { - am.reloadConfigMtx.Lock() - defer am.reloadConfigMtx.Unlock() - - cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ - AlertmanagerConfiguration: am.Settings.UnifiedAlerting.DefaultConfiguration, - Default: true, - ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), - OrgID: am.orgID, - } - - cfg, err := Load([]byte(am.Settings.UnifiedAlerting.DefaultConfiguration)) - if err != nil { - return err - } - - err = am.Store.SaveAlertmanagerConfigurationWithCallback(ctx, cmd, func() error { - if err := am.applyConfig(cfg, []byte(am.Settings.UnifiedAlerting.DefaultConfiguration)); err != nil { - return err - } - return nil - }) - if err != nil { - return err - } - - return nil -} - -// SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager. -// It rollbacks the save if we fail to apply the configuration. 
-func (am *Alertmanager) SaveAndApplyConfig(ctx context.Context, cfg *apimodels.PostableUserConfig) error { - rawConfig, err := json.Marshal(&cfg) - if err != nil { - return fmt.Errorf("failed to serialize to the Alertmanager configuration: %w", err) - } - - am.reloadConfigMtx.Lock() - defer am.reloadConfigMtx.Unlock() - - cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{ - AlertmanagerConfiguration: string(rawConfig), - ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion), - OrgID: am.orgID, - } - - err = am.Store.SaveAlertmanagerConfigurationWithCallback(ctx, cmd, func() error { - if err := am.applyConfig(cfg, rawConfig); err != nil { - return err - } - return nil - }) - if err != nil { - return err - } - - return nil -} - -// ApplyConfig applies the configuration to the Alertmanager. -func (am *Alertmanager) ApplyConfig(dbCfg *ngmodels.AlertConfiguration) error { - var err error - cfg, err := Load([]byte(dbCfg.AlertmanagerConfiguration)) - if err != nil { - return fmt.Errorf("failed to parse Alertmanager config: %w", err) - } - - am.reloadConfigMtx.Lock() - defer am.reloadConfigMtx.Unlock() - - if err = am.applyConfig(cfg, nil); err != nil { - return fmt.Errorf("unable to apply configuration: %w", err) - } - return nil -} - -func (am *Alertmanager) getTemplate() (*template.Template, error) { - am.reloadConfigMtx.RLock() - defer am.reloadConfigMtx.RUnlock() - if !am.ready() { - return nil, errors.New("alertmanager is not initialized") - } - paths := make([]string, 0, len(am.config.TemplateFiles)) - for name := range am.config.TemplateFiles { - paths = append(paths, filepath.Join(am.WorkingDirPath(), name)) - } - return am.templateFromPaths(paths...) -} - -func (am *Alertmanager) templateFromPaths(paths ...string) (*template.Template, error) { - tmpl, err := template.FromGlobs(paths...) 
- if err != nil { - return nil, err - } - externalURL, err := url.Parse(am.Settings.AppURL) - if err != nil { - return nil, err - } - tmpl.ExternalURL = externalURL - return tmpl, nil -} - -func (am *Alertmanager) buildMuteTimesMap(muteTimeIntervals []config.MuteTimeInterval) map[string][]timeinterval.TimeInterval { - muteTimes := make(map[string][]timeinterval.TimeInterval, len(muteTimeIntervals)) - for _, ti := range muteTimeIntervals { - muteTimes[ti.Name] = ti.TimeIntervals - } - return muteTimes -} - -// applyConfig applies a new configuration by re-initializing all components using the configuration provided. -// It is not safe to call concurrently. -func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig []byte) (err error) { - // First, let's make sure this config is not already loaded - var configChanged bool - if rawConfig == nil { - enc, err := json.Marshal(cfg.AlertmanagerConfig) - if err != nil { - // In theory, this should never happen. - return err - } - rawConfig = enc - } - - if am.configHash != md5.Sum(rawConfig) { - configChanged = true - } - - if cfg.TemplateFiles == nil { - cfg.TemplateFiles = map[string]string{} - } - cfg.TemplateFiles["__default__.tmpl"] = channels.DefaultTemplateString - - // next, we need to make sure we persist the templates to disk. - paths, templatesChanged, err := PersistTemplates(cfg, am.WorkingDirPath()) - if err != nil { - return err - } - - // If neither the configuration nor templates have changed, we've got nothing to do. - if !configChanged && !templatesChanged { - am.logger.Debug("neither config nor template have changed, skipping configuration sync.") - return nil - } - - // With the templates persisted, create the template list using the paths. - tmpl, err := am.templateFromPaths(paths...) - if err != nil { - return err - } - - // Finally, build the integrations map using the receiver configuration and templates. 
- integrationsMap, err := am.buildIntegrationsMap(cfg.AlertmanagerConfig.Receivers, tmpl) - if err != nil { - return fmt.Errorf("failed to build integration map: %w", err) - } - - // Now, let's put together our notification pipeline - routingStage := make(notify.RoutingStage, len(integrationsMap)) - - if am.inhibitor != nil { - am.inhibitor.Stop() - } - if am.dispatcher != nil { - am.dispatcher.Stop() - } - - am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.logger) - am.muteTimes = am.buildMuteTimesMap(cfg.AlertmanagerConfig.MuteTimeIntervals) - am.silencer = silence.NewSilencer(am.silences, am.marker, am.logger) - - meshStage := notify.NewGossipSettleStage(am.peer) - inhibitionStage := notify.NewMuteStage(am.inhibitor) - timeMuteStage := notify.NewTimeMuteStage(am.muteTimes) - silencingStage := notify.NewMuteStage(am.silencer) - for name := range integrationsMap { - stage := am.createReceiverStage(name, integrationsMap[name], am.waitFunc, am.notificationLog) - routingStage[name] = notify.MultiStage{meshStage, silencingStage, timeMuteStage, inhibitionStage, stage} - } - - am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route.AsAMRoute(), nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.logger, am.dispatcherMetrics) - - am.wg.Add(1) - go func() { - defer am.wg.Done() - am.dispatcher.Run() - }() - - am.wg.Add(1) - go func() { - defer am.wg.Done() - am.inhibitor.Run() - }() - - am.config = cfg - am.configHash = md5.Sum(rawConfig) - - return nil -} - -func (am *Alertmanager) WorkingDirPath() string { - return filepath.Join(am.Settings.DataPath, workingDir, strconv.Itoa(int(am.orgID))) -} - -// buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config. 
func (am *Alertmanager) buildIntegrationsMap(receivers []*apimodels.PostableApiReceiver, templates *template.Template) (map[string][]notify.Integration, error) {
	integrationsMap := make(map[string][]notify.Integration, len(receivers))
	for _, receiver := range receivers {
		integrations, err := am.buildReceiverIntegrations(receiver, templates)
		if err != nil {
			// Fail the whole map on the first bad receiver; a partial map
			// would silently drop notifications for it.
			return nil, err
		}
		integrationsMap[receiver.Name] = integrations
	}

	return integrationsMap, nil
}

// buildReceiverIntegrations builds a list of integration notifiers off of a receiver config.
func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) {
	var integrations []notify.Integration
	for i, r := range receiver.GrafanaManagedReceivers {
		n, err := am.buildReceiverIntegration(r, tmpl)
		if err != nil {
			return nil, err
		}
		// The index i distinguishes multiple integrations of the same type
		// within one receiver in the notification log.
		integrations = append(integrations, notify.NewIntegration(n, n, r.Type, i))
	}
	return integrations, nil
}

// buildReceiverIntegration constructs a single notification channel from a
// Grafana receiver config, returning InvalidReceiverError on any failure so
// callers can attribute the error to the offending receiver.
func (am *Alertmanager) buildReceiverIntegration(r *apimodels.PostableGrafanaReceiver, tmpl *template.Template) (channels.NotificationChannel, error) {
	// secure settings are already encrypted at this point
	secureSettings := make(map[string][]byte, len(r.SecureSettings))

	// Secure settings arrive base64-encoded; decode them into raw bytes.
	for k, v := range r.SecureSettings {
		d, err := base64.StdEncoding.DecodeString(v)
		if err != nil {
			return nil, InvalidReceiverError{
				Receiver: r,
				Err:      errors.New("failed to decode secure setting"),
			}
		}
		secureSettings[k] = d
	}

	var (
		cfg = &channels.NotificationChannelConfig{
			UID:                   r.UID,
			OrgID:                 am.orgID,
			Name:                  r.Name,
			Type:                  r.Type,
			DisableResolveMessage: r.DisableResolveMessage,
			Settings:              r.Settings,
			SecureSettings:        secureSettings,
		}
	)
	factoryConfig, err := channels.NewFactoryConfig(cfg, am.NotificationService, am.decryptFn, tmpl, am.Store)
	if err != nil {
		return nil, InvalidReceiverError{
			Receiver: r,
			Err:      err,
		}
	}
	// Look up the factory registered for this notifier type.
	receiverFactory, exists := channels.Factory(r.Type)
	if !exists {
		return nil, InvalidReceiverError{
			Receiver: r,
			Err:      fmt.Errorf("notifier %s is not supported", r.Type),
		}
	}
	n, err := receiverFactory(factoryConfig)
	if err != nil {
		return nil, InvalidReceiverError{
			Receiver: r,
			Err:      err,
		}
	}
	return n, nil
}

// PutAlerts receives the alerts and then sends them through the corresponding route based on whenever the alert has a receiver embedded or not
func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error {
	now := time.Now()
	alerts := make([]*types.Alert, 0, len(postableAlerts.PostableAlerts))
	// validationErr accumulates per-alert validation failures; alerts that
	// do validate are still delivered.
	var validationErr *AlertValidationError
	for _, a := range postableAlerts.PostableAlerts {
		// Convert the API alert into the internal Alertmanager type.
		alert := &types.Alert{
			Alert: model.Alert{
				Labels:       model.LabelSet{},
				Annotations:  model.LabelSet{},
				StartsAt:     time.Time(a.StartsAt),
				EndsAt:       time.Time(a.EndsAt),
				GeneratorURL: a.GeneratorURL.String(),
			},
			UpdatedAt: now,
		}

		for k, v := range a.Labels {
			if len(v) == 0 || k == ngmodels.NamespaceUIDLabel { // Skip empty and namespace UID labels.
				continue
			}
			alert.Alert.Labels[model.LabelName(k)] = model.LabelValue(v)
		}
		for k, v := range a.Annotations {
			if len(v) == 0 { // Skip empty annotation.
				continue
			}
			alert.Alert.Annotations[model.LabelName(k)] = model.LabelValue(v)
		}

		// Ensure StartsAt is set.
		if alert.StartsAt.IsZero() {
			if alert.EndsAt.IsZero() {
				alert.StartsAt = now
			} else {
				alert.StartsAt = alert.EndsAt
			}
		}
		// If no end time is defined, set a timeout after which an alert
		// is marked resolved if it is not updated.
		if alert.EndsAt.IsZero() {
			alert.Timeout = true
			alert.EndsAt = now.Add(defaultResolveTimeout)
		}

		// An alert whose end time is still in the future counts as firing;
		// otherwise it is considered resolved.
		if alert.EndsAt.After(now) {
			am.Metrics.Firing().Inc()
		} else {
			am.Metrics.Resolved().Inc()
		}

		if err := validateAlert(alert); err != nil {
			if validationErr == nil {
				validationErr = &AlertValidationError{}
			}
			validationErr.Alerts = append(validationErr.Alerts, a)
			validationErr.Errors = append(validationErr.Errors, err)
			am.Metrics.Invalid().Inc()
			continue
		}

		alerts = append(alerts, alert)
	}

	if err := am.alerts.Put(alerts...); err != nil {
		// Notification sending alert takes precedence over validation errors.
		return err
	}
	if validationErr != nil {
		// Even if validationErr is nil, the require.NoError fails on it.
		// (A typed nil *AlertValidationError returned as error would be a
		// non-nil interface value, so only return it when it is set.)
		return validationErr
	}
	return nil
}

// validateAlert is a.Validate() while additionally allowing
// space for label and annotation names.
func validateAlert(a *types.Alert) error {
	if a.StartsAt.IsZero() {
		return fmt.Errorf("start time missing")
	}
	if !a.EndsAt.IsZero() && a.EndsAt.Before(a.StartsAt) {
		return fmt.Errorf("start time must be before end time")
	}
	if err := validateLabelSet(a.Labels); err != nil {
		return fmt.Errorf("invalid label set: %s", err)
	}
	if len(a.Labels) == 0 {
		return fmt.Errorf("at least one label pair required")
	}
	if err := validateLabelSet(a.Annotations); err != nil {
		return fmt.Errorf("invalid annotations: %s", err)
	}
	return nil
}

// validateLabelSet is ls.Validate() while additionally allowing
// space for label names.
func validateLabelSet(ls model.LabelSet) error {
	for ln, lv := range ls {
		if !isValidLabelName(ln) {
			return fmt.Errorf("invalid name %q", ln)
		}
		if !lv.IsValid() {
			return fmt.Errorf("invalid value %q", lv)
		}
	}
	return nil
}

// isValidLabelName is ln.IsValid() without restrictions other than it can not be empty.
// The regex for Prometheus data model is ^[a-zA-Z_][a-zA-Z0-9_]*$.
-func isValidLabelName(ln model.LabelName) bool { - if len(ln) == 0 { - return false - } - - return utf8.ValidString(string(ln)) -} - -// AlertValidationError is the error capturing the validation errors -// faced on the alerts. -type AlertValidationError struct { - Alerts []amv2.PostableAlert - Errors []error // Errors[i] refers to Alerts[i]. -} - -func (e AlertValidationError) Error() string { - errMsg := "" - if len(e.Errors) != 0 { - errMsg = e.Errors[0].Error() - for _, e := range e.Errors[1:] { - errMsg += ";" + e.Error() - } - } - return errMsg -} - -// createReceiverStage creates a pipeline of stages for a receiver. -func (am *Alertmanager) createReceiverStage(name string, integrations []notify.Integration, wait func() time.Duration, notificationLog notify.NotificationLog) notify.Stage { - var fs notify.FanoutStage - for i := range integrations { - recv := &nflogpb.Receiver{ - GroupName: name, - Integration: integrations[i].Name(), - Idx: uint32(integrations[i].Index()), - } - var s notify.MultiStage - s = append(s, notify.NewWaitStage(wait)) - s = append(s, notify.NewDedupStage(&integrations[i], notificationLog, recv)) - s = append(s, notify.NewRetryStage(integrations[i], name, am.stageMetrics)) - s = append(s, notify.NewSetNotifiesStage(notificationLog, recv)) - - fs = append(fs, s) - } - return fs -} - -func (am *Alertmanager) waitFunc() time.Duration { - return time.Duration(am.peer.Position()) * am.peerTimeout -} - -func (am *Alertmanager) timeoutFunc(d time.Duration) time.Duration { - // time.Duration d relates to the receiver's group_interval. Even with a group interval of 1s, - // we need to make sure (non-position-0) peers in the cluster wait before flushing the notifications. - if d < notify.MinTimeout { - d = notify.MinTimeout - } - return d + am.waitFunc() -} - -type nilLimits struct{} - -func (n nilLimits) MaxNumberOfAggregationGroups() int { return 0 }