diff --git a/docs/clients/promtail/configuration.md b/docs/clients/promtail/configuration.md index a80e1ae0bac67..fd28b561dea86 100644 --- a/docs/clients/promtail/configuration.md +++ b/docs/clients/promtail/configuration.md @@ -68,6 +68,11 @@ Supported contents and default values of `config.yaml`: # Describes how Promtail connects to multiple instances # of Loki, sending logs to each. +# WARNING: If one of the remote Loki servers fails to respond or responds +# with any retryable error, this will impact sending logs to any +# other configured remote Loki servers. Sending is done on a single thread! +# It is generally recommended to run multiple Promtail clients in parallel +# if you want to send to multiple remote Loki instances. clients: - [<client_config>] diff --git a/pkg/promtail/client/client.go b/pkg/promtail/client/client.go index 8b24324d3cdd1..38fabac6165e4 100644 --- a/pkg/promtail/client/client.go +++ b/pkg/promtail/client/client.go @@ -234,8 +234,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) { return } - // Only retry 500s and connection-level errors. - if status > 0 && status/100 != 5 { + // Only retry 429s, 5xx responses and connection-level errors. 
+ if status > 0 && status != 429 && status/100 != 5 { break } diff --git a/pkg/promtail/client/client_test.go b/pkg/promtail/client/client_test.go index 439f2ece5be07..6fb4942bbe5c5 100644 --- a/pkg/promtail/client/client_test.go +++ b/pkg/promtail/client/client_test.go @@ -152,6 +152,35 @@ func TestClient_Handle(t *testing.T) { promtail_sent_entries_total{host="__HOST__"} 0 `, }, + "do retry sending a batch in case the server responds with a 429": { + clientBatchSize: 10, + clientBatchWait: 10 * time.Millisecond, + clientMaxRetries: 3, + serverResponseStatus: 429, + inputEntries: []entry{logEntries[0]}, + expectedReqs: []receivedReq{ + { + tenantID: "", + pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}}, + }, + { + tenantID: "", + pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}}, + }, + { + tenantID: "", + pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}}, + }, + }, + expectedMetrics: ` + # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. + # TYPE promtail_dropped_entries_total counter + promtail_dropped_entries_total{host="__HOST__"} 1.0 + # HELP promtail_sent_entries_total Number of log entries sent to the ingester. 
+ # TYPE promtail_sent_entries_total counter + promtail_sent_entries_total{host="__HOST__"} 0 + `, + }, "batch log entries together honoring the client tenant ID": { clientBatchSize: 100, clientBatchWait: 100 * time.Millisecond, diff --git a/pkg/promtail/client/config.go b/pkg/promtail/client/config.go index 5b7cdf6029172..2908130480421 100644 --- a/pkg/promtail/client/config.go +++ b/pkg/promtail/client/config.go @@ -34,10 +34,10 @@ func (c *Config) RegisterFlags(flags *flag.FlagSet) { flags.Var(&c.URL, "client.url", "URL of log server") flags.DurationVar(&c.BatchWait, "client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.") flags.IntVar(&c.BatchSize, "client.batch-size-bytes", 100*1024, "Maximum batch size to accrue before sending. ") - - flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 5, "Maximum number of retires when sending batches.") - flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 100*time.Millisecond, "Initial backoff time between retries.") - flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Second, "Maximum backoff time between retries.") + // Default backoff schedule: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (4.267m), for a total time of 511.5s (8.5m) before logs are lost. + flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 10, "Maximum number of retires when sending batches.") + flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 500*time.Millisecond, "Initial backoff time between retries.") + flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Minute, "Maximum backoff time between retries.") flag.DurationVar(&c.Timeout, "client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request") flags.Var(&c.ExternalLabels, "client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)")