From 8a222ce2ac3149001c097eb2f69e269665c1b53d Mon Sep 17 00:00:00 2001 From: Calle Pettersson Date: Fri, 2 Dec 2022 22:07:38 +0100 Subject: [PATCH] Handle errors in Slack's postMessage API For larger organisations, using the postMessage API has advantages over webhooks, such as not being tied to a specific user (who might leave the company at some point, resulting in the webhook integration becoming disabled), and posting to multiple channels. Alertmanager's Slack integration actually works with this endpoint out of the box, by setting the URL path to /api/chat.postMessage. However, error handling works a bit differently on this endpoint, always returning 200 OK even when the posted message is rejected. This commit handles the response format from this endpoint, so that errors are reported correctly. --- notify/slack/slack.go | 30 +++++++++++++++- notify/slack/slack_test.go | 70 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/notify/slack/slack.go b/notify/slack/slack.go index 5f506f7feb..4c55837c55 100644 --- a/notify/slack/slack.go +++ b/notify/slack/slack.go @@ -85,6 +85,11 @@ type attachment struct { MrkdwnIn []string `json:"mrkdwn_in,omitempty"` } +type postMessageResponse struct { + Ok bool `json:"ok"` + Error string `json:"error"` +} + // Notify implements the Notifier interface. func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) { var err error @@ -205,10 +210,33 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) } defer notify.Drain(resp) - // Only 5xx response codes are recoverable and 2xx codes are successful. + // Only 5xx response codes are recoverable and 2xx codes are successful (unless using the + // postMessage API, see below). // https://api.slack.com/incoming-webhooks#handling_errors // https://api.slack.com/changelog/2016-05-17-changes-to-errors-for-incoming-webhooks retry, err := n.retrier.Check(resp.StatusCode, resp.Body) err = errors.Wrap(err, fmt.Sprintf("channel %q", req.Channel)) + + // For the postMessage API, if the status code is 200, we need to do + // additional checks, since the API returns 200 even for circumstances + // which might typically have been a 4xx response (eg invalid channel) + // https://api.slack.com/methods/chat.postMessage#errors + if resp.StatusCode == 200 && strings.HasSuffix(u, "api/chat.postMessage") { + apiResp := postMessageResponse{} + d := json.NewDecoder(resp.Body) + decErr := d.Decode(&apiResp) + if decErr != nil { + // This should only happen if the Slack API is returning non-json, + // which would presumably only happen in major outages where they + // do not process calls(?). Always retry. + err = errors.Wrap(errors.New("could not decode slack API response"), fmt.Sprintf("channel %q", req.Channel)) + retry = true + } + if !apiResp.Ok { + err = errors.Wrap(errors.New(apiResp.Error), fmt.Sprintf("channel %q", req.Channel)) + retry = false + } + } + return retry, err } diff --git a/notify/slack/slack_test.go b/notify/slack/slack_test.go index 9a4f16cf0a..621b3b9b0e 100644 --- a/notify/slack/slack_test.go +++ b/notify/slack/slack_test.go @@ -14,16 +14,23 @@ package slack import ( + "context" "fmt" + "net/http" + "net/http/httptest" + "net/url" "os" "testing" + "time" "github.com/go-kit/log" commoncfg "github.com/prometheus/common/config" + "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "github.com/prometheus/alertmanager/config" "github.com/prometheus/alertmanager/notify/test" + "github.com/prometheus/alertmanager/types" ) func TestSlackRetry(t *testing.T) { @@ -42,6 +49,69 @@ func TestSlackRetry(t *testing.T) { } } +func TestSlackPostMessageHappyPath(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, `{"ok": true}`) + })) + defer ts.Close() + tsUrl, err := url.Parse(ts.URL) + require.NoError(t, err) + tsUrl.Path = "api/chat.postMessage" + + notifier, err := New( + &config.SlackConfig{ + HTTPConfig: &commoncfg.HTTPClientConfig{}, + APIURL: &config.SecretURL{URL: tsUrl}, + }, + test.CreateTmpl(t), + log.NewNopLogger(), + ) + require.NoError(t, err) + + _, err = notifier.Notify(context.TODO(), &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "lbl1": "val1", + }, + StartsAt: time.Now(), + EndsAt: time.Now().Add(time.Hour), + }, + }) + require.NoError(t, err) +} + +func TestSlackPostMessageErrorHandling(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, `{"ok": false, "error": "no_text"}`) + })) + defer ts.Close() + tsUrl, err := url.Parse(ts.URL) + require.NoError(t, err) + tsUrl.Path = "api/chat.postMessage" + + notifier, err := New( + &config.SlackConfig{ + HTTPConfig: &commoncfg.HTTPClientConfig{}, + APIURL: &config.SecretURL{URL: tsUrl}, + }, + test.CreateTmpl(t), + log.NewNopLogger(), + ) + require.NoError(t, err) + + retry, err := notifier.Notify(context.TODO(), &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "lbl1": "val1", + }, + StartsAt: time.Now(), + EndsAt: time.Now().Add(time.Hour), + }, + }) + require.False(t, retry, "Should not retry configuration errors") + require.Error(t, err) +} + func TestSlackRedactedURL(t *testing.T) { ctx, u, fn := test.GetContextWithCancelingURL() defer fn()