promtail retry 429 rate limit errors, clarifying risks in the docs of configuring multiple client sections in promtail, also increased the backoff and retry settings in promtail.

Signed-off-by: Edward Welch <edward.welch@grafana.com>
slim-bean authored and Ed Welch committed Mar 23, 2020
1 parent 30303c6 commit 6841c41
Showing 4 changed files with 40 additions and 6 deletions.
docs/clients/promtail/configuration.md (5 changes: 5 additions & 0 deletions)
@@ -68,6 +68,11 @@ Supported contents and default values of `config.yaml`:

 # Describes how Promtail connects to multiple instances
 # of Loki, sending logs to each.
+# WARNING: If one of the remote Loki servers fails to respond or responds
+# with any error which is retriable, this will impact sending logs to any
+# other configured remote Loki servers. Sending is done on a single thread!
+# It is generally recommended to run multiple promtail clients in parallel
+# if you want to send to multiple remote Loki instances.
 clients:
   - [<client_config>]

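The warning added above is about delivery architecture rather than YAML syntax: batches are pushed to the configured remotes from a single sending loop, so a remote that is busy retrying can hold up the others. Below is a minimal, hypothetical Go sketch of that hazard; the remote type, sendWithRetries, and pushToAll names are illustrative stand-ins and not promtail's actual client code.

package main

import (
	"fmt"
	"time"
)

// remote is a hypothetical stand-in for one configured Loki client.
type remote struct {
	name  string
	delay time.Duration // simulates time spent backing off and retrying 429/5xx responses
}

// sendWithRetries blocks until the batch is delivered or retries are exhausted.
func (r remote) sendWithRetries(batch string) {
	time.Sleep(r.delay)
	fmt.Printf("delivered %q to %s after %v\n", batch, r.name, r.delay)
}

// pushToAll fans one batch out to every remote from a single loop, mirroring
// the single-threaded behaviour the warning describes: a remote stuck in
// retries delays delivery to every remote that comes after it.
func pushToAll(remotes []remote, batch string) {
	for _, r := range remotes {
		r.sendWithRetries(batch)
	}
}

func main() {
	remotes := []remote{
		{name: "loki-a", delay: 2 * time.Second}, // rate limited and retrying
		{name: "loki-b", delay: 0},               // healthy, but forced to wait behind loki-a
	}
	pushToAll(remotes, "batch-1")
}

Running one promtail process per remote, as the note recommends, removes this coupling because each process has its own sending loop.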
pkg/promtail/client/client.go (4 changes: 2 additions & 2 deletions)
@@ -234,8 +234,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) {
             return
         }
 
-        // Only retry 500s and connection-level errors.
-        if status > 0 && status/100 != 5 {
+        // Only retry 429s, 500s and connection-level errors.
+        if status > 0 && status != 429 && status/100 != 5 {
             break
         }

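As a reading aid, here is a small, hypothetical Go sketch of the classification the new condition implements; the shouldRetry helper is assumed for illustration only, since the real code keeps the inverted check inline in sendBatch.

package main

import "fmt"

// shouldRetry mirrors the updated condition above: retry connection-level
// failures (no HTTP status), 429 rate-limit responses, and 5xx server errors;
// give up immediately on other statuses such as 4xx client errors.
func shouldRetry(status int) bool {
	if status <= 0 {
		return true // connection-level error, no HTTP status was received
	}
	if status == 429 {
		return true // rate limited; back off and try again
	}
	return status/100 == 5 // treat server-side errors as transient
}

func main() {
	for _, status := range []int{0, 400, 429, 500, 503} {
		fmt.Printf("status %d -> retry: %v\n", status, shouldRetry(status))
	}
}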
pkg/promtail/client/client_test.go (29 changes: 29 additions & 0 deletions)
@@ -152,6 +152,35 @@ func TestClient_Handle(t *testing.T) {
                 promtail_sent_entries_total{host="__HOST__"} 0
             `,
         },
+        "do retry sending a batch in case the server responds with a 429": {
+            clientBatchSize: 10,
+            clientBatchWait: 10 * time.Millisecond,
+            clientMaxRetries: 3,
+            serverResponseStatus: 429,
+            inputEntries: []entry{logEntries[0]},
+            expectedReqs: []receivedReq{
+                {
+                    tenantID: "",
+                    pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
+                },
+                {
+                    tenantID: "",
+                    pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
+                },
+                {
+                    tenantID: "",
+                    pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
+                },
+            },
+            expectedMetrics: `
+                # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
+                # TYPE promtail_dropped_entries_total counter
+                promtail_dropped_entries_total{host="__HOST__"} 1.0
+                # HELP promtail_sent_entries_total Number of log entries sent to the ingester.
+                # TYPE promtail_sent_entries_total counter
+                promtail_sent_entries_total{host="__HOST__"} 0
+            `,
+        },
         "batch log entries together honoring the client tenant ID": {
             clientBatchSize: 100,
             clientBatchWait: 100 * time.Millisecond,
pkg/promtail/client/config.go (8 changes: 4 additions & 4 deletions)
@@ -34,10 +34,10 @@ func (c *Config) RegisterFlags(flags *flag.FlagSet) {
     flags.Var(&c.URL, "client.url", "URL of log server")
     flags.DurationVar(&c.BatchWait, "client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.")
     flags.IntVar(&c.BatchSize, "client.batch-size-bytes", 100*1024, "Maximum batch size to accrue before sending. ")
-
-    flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 5, "Maximum number of retries when sending batches.")
-    flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 100*time.Millisecond, "Initial backoff time between retries.")
-    flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Second, "Maximum backoff time between retries.")
+    // Default backoff schedule: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (4.267m), for a total time of 511.5s (8.5m) before logs are lost
+    flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 10, "Maximum number of retries when sending batches.")
+    flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 500*time.Millisecond, "Initial backoff time between retries.")
+    flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Minute, "Maximum backoff time between retries.")
     flag.DurationVar(&c.Timeout, "client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request")
     flags.Var(&c.ExternalLabels, "client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)")

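The new comment is a back-of-the-envelope schedule. A short sketch that reproduces that arithmetic under the new defaults (500ms initial backoff doubling each retry, 10 retries, capped at 5m) follows; it only mirrors the comment, not the shared backoff implementation the client actually uses.

package main

import (
	"fmt"
	"time"
)

func main() {
	minBackoff := 500 * time.Millisecond
	maxBackoff := 5 * time.Minute
	retries := 10

	var total time.Duration
	backoff := minBackoff
	for i := 0; i < retries; i++ {
		fmt.Printf("retry %2d: wait %v\n", i+1, backoff)
		total += backoff
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
	// Prints 8m31.5s, i.e. 511.5s, matching the figure in the comment;
	// the 5m cap is never reached because the last wait is 256s.
	fmt.Printf("total wait before the batch is dropped: %v\n", total)
}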
