promtail retry 429 rate limit errors, clarifying risks in the docs of configuring multiple client sections in promtail, also increased the backoff and retry settings in promtail.

Signed-off-by: Edward Welch <edward.welch@grafana.com>
slim-bean authored and Ed Welch committed Mar 23, 2020
1 parent 30303c6 commit 6841c41
Showing 4 changed files with 40 additions and 6 deletions.
docs/clients/promtail/configuration.md (5 changes: 5 additions & 0 deletions)
@@ -68,6 +68,11 @@ Supported contents and default values of `config.yaml`:

 # Describes how Promtail connects to multiple instances
 # of Loki, sending logs to each.
+# WARNING: If one of the remote Loki servers fails to respond or responds
+# with any error which is retriable, this will impact sending logs to any
+# other configured remote Loki servers. Sending is done on a single thread!
+# It is generally recommended to run multiple promtail clients in parallel
+# if you want to send to multiple remote Loki instances.
 clients:
   - [<client_config>]

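The warning added above is about delivery architecture rather than YAML syntax: batches are pushed to the configured remotes from a single sending loop, so a remote that is busy retrying can hold up the others. Below is a minimal, hypothetical Go sketch of that hazard; the remote type, sendWithRetries, and pushToAll names are illustrative stand-ins and not promtail's actual client code.

package main

import (
	"fmt"
	"time"
)

// remote is a hypothetical stand-in for one configured Loki client.
type remote struct {
	name  string
	delay time.Duration // simulates time spent backing off and retrying 429/5xx responses
}

// sendWithRetries blocks until the batch is delivered or retries are exhausted.
func (r remote) sendWithRetries(batch string) {
	time.Sleep(r.delay)
	fmt.Printf("delivered %q to %s after %v\n", batch, r.name, r.delay)
}

// pushToAll fans one batch out to every remote from a single loop, mirroring
// the single-threaded behaviour the warning describes: a remote stuck in
// retries delays delivery to every remote that comes after it.
func pushToAll(remotes []remote, batch string) {
	for _, r := range remotes {
		r.sendWithRetries(batch)
	}
}

func main() {
	remotes := []remote{
		{name: "loki-a", delay: 2 * time.Second}, // rate limited and retrying
		{name: "loki-b", delay: 0},               // healthy, but forced to wait behind loki-a
	}
	pushToAll(remotes, "batch-1")
}

Running one promtail process per remote, as the note recommends, removes this coupling because each process has its own sending loop.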
pkg/promtail/client/client.go (4 changes: 2 additions & 2 deletions)
@@ -234,8 +234,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) {
             return
         }
 
-        // Only retry 500s and connection-level errors.
-        if status > 0 && status/100 != 5 {
+        // Only retry 429s, 500s and connection-level errors.
+        if status > 0 && status != 429 && status/100 != 5 {
             break
         }

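As a reading aid, here is a small, hypothetical Go sketch of the classification the new condition implements; the shouldRetry helper is assumed for illustration only, since the real code keeps the inverted check inline in sendBatch.

package main

import "fmt"

// shouldRetry mirrors the updated condition above: retry connection-level
// failures (no HTTP status), 429 rate-limit responses, and 5xx server errors;
// give up immediately on other statuses such as 4xx client errors.
func shouldRetry(status int) bool {
	if status <= 0 {
		return true // connection-level error, no HTTP status was received
	}
	if status == 429 {
		return true // rate limited; back off and try again
	}
	return status/100 == 5 // treat server-side errors as transient
}

func main() {
	for _, status := range []int{0, 400, 429, 500, 503} {
		fmt.Printf("status %d -> retry: %v\n", status, shouldRetry(status))
	}
}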
pkg/promtail/client/client_test.go (29 changes: 29 additions & 0 deletions)
@@ -152,6 +152,35 @@ func TestClient_Handle(t *testing.T) {
                 promtail_sent_entries_total{host="__HOST__"} 0
             `,
         },
+        "do retry sending a batch in case the server responds with a 429": {
+            clientBatchSize: 10,
+            clientBatchWait: 10 * time.Millisecond,
+            clientMaxRetries: 3,
+            serverResponseStatus: 429,
+            inputEntries: []entry{logEntries[0]},
+            expectedReqs: []receivedReq{
+                {
+                    tenantID: "",
+                    pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
+                },
+                {
+                    tenantID: "",
+                    pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
+                },
+                {
+                    tenantID: "",
+                    pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
+                },
+            },
+            expectedMetrics: `
+                # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
+                # TYPE promtail_dropped_entries_total counter
+                promtail_dropped_entries_total{host="__HOST__"} 1.0
+                # HELP promtail_sent_entries_total Number of log entries sent to the ingester.
+                # TYPE promtail_sent_entries_total counter
+                promtail_sent_entries_total{host="__HOST__"} 0
+            `,
+        },
         "batch log entries together honoring the client tenant ID": {
             clientBatchSize: 100,
             clientBatchWait: 100 * time.Millisecond,
pkg/promtail/client/config.go (8 changes: 4 additions & 4 deletions)
@@ -34,10 +34,10 @@ func (c *Config) RegisterFlags(flags *flag.FlagSet) {
     flags.Var(&c.URL, "client.url", "URL of log server")
     flags.DurationVar(&c.BatchWait, "client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.")
     flags.IntVar(&c.BatchSize, "client.batch-size-bytes", 100*1024, "Maximum batch size to accrue before sending. ")
-
-    flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 5, "Maximum number of retries when sending batches.")
-    flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 100*time.Millisecond, "Initial backoff time between retries.")
-    flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Second, "Maximum backoff time between retries.")
+    // Default backoff schedule: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (4.267m), for a total time of 511.5s (8.5m) before logs are lost
+    flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 10, "Maximum number of retries when sending batches.")
+    flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 500*time.Millisecond, "Initial backoff time between retries.")
+    flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Minute, "Maximum backoff time between retries.")
     flag.DurationVar(&c.Timeout, "client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request")
     flags.Var(&c.ExternalLabels, "client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)")

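The new comment is a back-of-the-envelope schedule. A short sketch that reproduces that arithmetic under the new defaults (500ms initial backoff doubling each retry, 10 retries, capped at 5m) follows; it only mirrors the comment, not the shared backoff implementation the client actually uses.

package main

import (
	"fmt"
	"time"
)

func main() {
	minBackoff := 500 * time.Millisecond
	maxBackoff := 5 * time.Minute
	retries := 10

	var total time.Duration
	backoff := minBackoff
	for i := 0; i < retries; i++ {
		fmt.Printf("retry %2d: wait %v\n", i+1, backoff)
		total += backoff
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
	// Prints 8m31.5s, i.e. 511.5s, matching the figure in the comment;
	// the 5m cap is never reached because the last wait is 256s.
	fmt.Printf("total wait before the batch is dropped: %v\n", total)
}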
