From 17121779f12f1b5f8dc5ba1a2c6a5c22981fd692 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 29 Jan 2025 00:41:19 -0500 Subject: [PATCH] Improve PostingsForMatchers cache observability (#10525) * Improve PostingsForMatchers cache observability Signed-off-by: Marco Pracucci * Fixed unit tests Signed-off-by: Marco Pracucci --------- Signed-off-by: Marco Pracucci --- CHANGELOG.md | 4 +- go.mod | 2 +- go.sum | 4 +- pkg/ingester/metrics_test.go | 28 +++++ .../tsdb/postings_for_matchers_cache.go | 107 +++++++++++++++--- vendor/modules.txt | 4 +- 6 files changed, 128 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c77a707950..4e9c288256 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,15 +27,17 @@ * [ENHANCEMENT] Ingester: Hide tokens in ingester ring status page when ingest storage is enabled #10399 * [ENHANCEMENT] Ingester: add `active_series_additional_custom_trackers` configuration, in addition to the already existing `active_series_custom_trackers`. The `active_series_additional_custom_trackers` configuration allows you to configure additional custom trackers that get merged with `active_series_custom_trackers` at runtime. #10428 * [ENHANCEMENT] Query-frontend: Allow blocking raw http requests with the `blocked_requests` configuration. Requests can be blocked based on their path, method or query parameters #10484 -* [ENHANCEMENT] Ingester: Added the following metrics exported by `PostingsForMatchers` cache: #10500 +* [ENHANCEMENT] Ingester: Added the following metrics exported by `PostingsForMatchers` cache: #10500 #10525 * `cortex_ingester_tsdb_head_postings_for_matchers_cache_hits_total` * `cortex_ingester_tsdb_head_postings_for_matchers_cache_misses_total` * `cortex_ingester_tsdb_head_postings_for_matchers_cache_requests_total` * `cortex_ingester_tsdb_head_postings_for_matchers_cache_skips_total` + * `cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total` * `cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total` * `cortex_ingester_tsdb_block_postings_for_matchers_cache_misses_total` * `cortex_ingester_tsdb_block_postings_for_matchers_cache_requests_total` * `cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total` + * `cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total` * [ENHANCEMENT] Compactor: Shuffle users' order in `BlocksCleaner`. Prevents bucket indexes from going an extended period without cleanup during compactor restarts. #10513 * [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185 * [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154 diff --git a/go.mod b/go.mod index f1f96bf0a0..84a05ee26e 100644 --- a/go.mod +++ b/go.mod @@ -288,7 +288,7 @@ require ( sigs.k8s.io/yaml v1.4.0 // indirect ) -replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20250123075837-0cc2978b5013 +replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20250128175949-8d210000df3a // Replace memberlist with our fork which includes some fixes that haven't been // merged upstream yet: diff --git a/go.sum b/go.sum index da5f6bf168..d564af85b4 100644 --- a/go.sum +++ b/go.sum @@ -1283,8 +1283,8 @@ github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40 h1:1TeKhyS+pvzO github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40/go.mod h1:IGRj8oOoxwJbHBYl1+OhS9UjQR0dv6SQOep7HqmtyFU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe h1:yIXAAbLswn7VNWBIvM71O2QsgfgW9fRXZNR0DXe6pDU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/grafana/mimir-prometheus v0.0.0-20250123075837-0cc2978b5013 h1:70NFJ8OVRMCPc89vN520cTJd0vo/elnaXoF7q0I6c2M= -github.com/grafana/mimir-prometheus v0.0.0-20250123075837-0cc2978b5013/go.mod h1:KfyZCeyGxf5gvl6VZbrQsd400nJjGw+ygMEtDVZKIT4= +github.com/grafana/mimir-prometheus v0.0.0-20250128175949-8d210000df3a h1:4dVgKpv9kmX/HHYBVayiZBvN4ibLVtetZWVY1GCMhL8= +github.com/grafana/mimir-prometheus v0.0.0-20250128175949-8d210000df3a/go.mod h1:KfyZCeyGxf5gvl6VZbrQsd400nJjGw+ygMEtDVZKIT4= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= github.com/grafana/prometheus-alertmanager v0.25.1-0.20240930132144-b5e64e81e8d3 h1:6D2gGAwyQBElSrp3E+9lSr7k8gLuP3Aiy20rweLWeBw= diff --git a/pkg/ingester/metrics_test.go b/pkg/ingester/metrics_test.go index 23c479aa26..98364c2e4e 100644 --- a/pkg/ingester/metrics_test.go +++ b/pkg/ingester/metrics_test.go @@ -258,6 +258,13 @@ func TestTSDBMetrics(t *testing.T) { cortex_ingester_tsdb_head_postings_for_matchers_cache_skips_total{reason="ineligible"} 0 cortex_ingester_tsdb_head_postings_for_matchers_cache_skips_total{reason="stale-cached-entry"} 0 + # HELP cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total Total number of evictions from the PostingsForMatchers cache. + # TYPE cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total counter + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="max-bytes-reached"} 0 + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0 + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0 + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="unknown"} 0 + # HELP cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total Total number of postings lists returned from the PostingsForMatchers cache. # TYPE cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total counter cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total 0 @@ -275,6 +282,13 @@ func TestTSDBMetrics(t *testing.T) { cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total{reason="canceled-cached-entry"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total{reason="ineligible"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total{reason="stale-cached-entry"} 0 + + # HELP cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total Total number of evictions from the PostingsForMatchers cache. + # TYPE cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total counter + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-bytes-reached"} 0 + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0 + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0 + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="unknown"} 0 `)) require.NoError(t, err) } @@ -512,6 +526,13 @@ func TestTSDBMetricsWithRemoval(t *testing.T) { cortex_ingester_tsdb_head_postings_for_matchers_cache_skips_total{reason="ineligible"} 0 cortex_ingester_tsdb_head_postings_for_matchers_cache_skips_total{reason="stale-cached-entry"} 0 + # HELP cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total Total number of evictions from the PostingsForMatchers cache. + # TYPE cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total counter + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="max-bytes-reached"} 0 + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0 + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0 + cortex_ingester_tsdb_head_postings_for_matchers_cache_evictions_total{reason="unknown"} 0 + # HELP cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total Total number of postings lists returned from the PostingsForMatchers cache. # TYPE cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total counter cortex_ingester_tsdb_block_postings_for_matchers_cache_hits_total 0 @@ -529,6 +550,13 @@ func TestTSDBMetricsWithRemoval(t *testing.T) { cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total{reason="canceled-cached-entry"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total{reason="ineligible"} 0 cortex_ingester_tsdb_block_postings_for_matchers_cache_skips_total{reason="stale-cached-entry"} 0 + + # HELP cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total Total number of evictions from the PostingsForMatchers cache. + # TYPE cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total counter + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-bytes-reached"} 0 + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0 + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0 + cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="unknown"} 0 `)) require.NoError(t, err) } diff --git a/vendor/github.com/prometheus/prometheus/tsdb/postings_for_matchers_cache.go b/vendor/github.com/prometheus/prometheus/tsdb/postings_for_matchers_cache.go index e82134eb71..b498eff19f 100644 --- a/vendor/github.com/prometheus/prometheus/tsdb/postings_for_matchers_cache.go +++ b/vendor/github.com/prometheus/prometheus/tsdb/postings_for_matchers_cache.go @@ -31,6 +31,14 @@ const ( DefaultPostingsForMatchersCacheForce = false ) +const ( + evictionReasonTTL = iota + evictionReasonMaxBytes + evictionReasonMaxItems + evictionReasonUnknown + evictionReasonsLength +) + // IndexPostingsReader is a subset of IndexReader methods, the minimum required to evaluate PostingsForMatchers. type IndexPostingsReader interface { // LabelValues returns possible label values which may not be sorted. @@ -179,6 +187,12 @@ func (p *postingsForMatcherPromise) result(ctx context.Context) (index.Postings, if p.err != nil { return nil, fmt.Errorf("postingsForMatchers promise completed with error: %w", p.err) } + + trace.SpanFromContext(ctx).AddEvent("completed postingsForMatchers promise", trace.WithAttributes( + // Do not format the timestamp to introduce a performance regression. + attribute.Int64("evaluation completed at (epoch seconds)", p.evaluationCompletedAt.Unix()), + )) + return p.cloner.Clone(), nil } } @@ -256,7 +270,7 @@ func (c *PostingsForMatchersCache) postingsForMatchersPromise(ctx context.Contex } c.metrics.hits.Inc() - span.AddEvent("using cached postingsForMatchers promise", trace.WithAttributes( + span.AddEvent("waiting cached postingsForMatchers promise", trace.WithAttributes( attribute.String("cache_key", key), )) @@ -316,7 +330,7 @@ func (c *PostingsForMatchersCache) expire() { defer c.expireInProgress.Store(false) c.cachedMtx.RLock() - if !c.shouldEvictHead() { + if !c.shouldEvictHead(c.timeNow()) { c.cachedMtx.RUnlock() return } @@ -327,18 +341,32 @@ func (c *PostingsForMatchersCache) expire() { c.evictHeadBeforeHook() } - c.cachedMtx.Lock() - defer c.cachedMtx.Unlock() + var evictionReasons [evictionReasonsLength]int - for c.shouldEvictHead() { - c.evictHead() + // Evict the head taking an exclusive lock. + { + c.cachedMtx.Lock() + + now := c.timeNow() + for c.shouldEvictHead(now) { + reason := c.evictHead(now) + evictionReasons[reason]++ + } + + c.cachedMtx.Unlock() } + + // Keep track of the reason why items where evicted. + c.metrics.evictionsBecauseTTL.Add(float64(evictionReasons[evictionReasonTTL])) + c.metrics.evictionsBecauseMaxBytes.Add(float64(evictionReasons[evictionReasonMaxBytes])) + c.metrics.evictionsBecauseMaxItems.Add(float64(evictionReasons[evictionReasonMaxItems])) + c.metrics.evictionsBecauseUnknown.Add(float64(evictionReasons[evictionReasonUnknown])) } // shouldEvictHead returns true if cache head should be evicted, either because it's too old, // or because the cache has too many elements // should be called while read lock is held on cachedMtx. -func (c *PostingsForMatchersCache) shouldEvictHead() bool { +func (c *PostingsForMatchersCache) shouldEvictHead(now time.Time) bool { // The cache should be evicted for sure if the max size (either items or bytes) is reached. if c.cached.Len() > c.maxItems || c.cachedBytes > c.maxBytes { return true @@ -349,15 +377,37 @@ func (c *PostingsForMatchersCache) shouldEvictHead() bool { return false } ts := h.Value.(*postingsForMatchersCachedCall).ts - return c.timeNow().Sub(ts) >= c.ttl + return now.Sub(ts) >= c.ttl } -func (c *PostingsForMatchersCache) evictHead() { +func (c *PostingsForMatchersCache) evictHead(now time.Time) (reason int) { front := c.cached.Front() oldest := front.Value.(*postingsForMatchersCachedCall) + + // Detect the reason why we're evicting it. + // + // The checks order is: + // 1. TTL: if an item is expired, it should be tracked as such even if the cache was full. + // 2. Max bytes: "max items" is deprecated, and we expect to set it to a high value because + // we want to primarily limit by bytes size. + // 3. Max items: the last one. + switch { + case now.Sub(oldest.ts) >= c.ttl: + reason = evictionReasonTTL + case c.cachedBytes > c.maxBytes: + reason = evictionReasonMaxBytes + case c.cached.Len() > c.maxItems: + reason = evictionReasonMaxItems + default: + // This should never happen, but we track it to detect unexpected behaviours. + reason = evictionReasonUnknown + } + c.calls.Delete(oldest.key) c.cached.Remove(front) c.cachedBytes -= oldest.sizeBytes + + return } // onPromiseExecutionDone must be called once the execution of PostingsForMatchers promise has done. @@ -555,18 +605,25 @@ func (t *contextsTracker) trackedContextsCount() int { } type PostingsForMatchersCacheMetrics struct { - requests prometheus.Counter - hits prometheus.Counter - misses prometheus.Counter - skipsBecauseIneligible prometheus.Counter - skipsBecauseStale prometheus.Counter - skipsBecauseCanceled prometheus.Counter + requests prometheus.Counter + hits prometheus.Counter + misses prometheus.Counter + skipsBecauseIneligible prometheus.Counter + skipsBecauseStale prometheus.Counter + skipsBecauseCanceled prometheus.Counter + evictionsBecauseTTL prometheus.Counter + evictionsBecauseMaxBytes prometheus.Counter + evictionsBecauseMaxItems prometheus.Counter + evictionsBecauseUnknown prometheus.Counter } func NewPostingsForMatchersCacheMetrics(reg prometheus.Registerer) *PostingsForMatchersCacheMetrics { const ( skipsMetric = "postings_for_matchers_cache_skips_total" skipsHelp = "Total number of requests to the PostingsForMatchers cache that have been skipped the cache. The subsequent result is not cached." + + evictionsMetric = "postings_for_matchers_cache_evictions_total" + evictionsHelp = "Total number of evictions from the PostingsForMatchers cache." ) return &PostingsForMatchersCacheMetrics{ @@ -597,5 +654,25 @@ func NewPostingsForMatchersCacheMetrics(reg prometheus.Registerer) *PostingsForM Help: skipsHelp, ConstLabels: map[string]string{"reason": "canceled-cached-entry"}, }), + evictionsBecauseTTL: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: evictionsMetric, + Help: evictionsHelp, + ConstLabels: map[string]string{"reason": "ttl-expired"}, + }), + evictionsBecauseMaxBytes: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: evictionsMetric, + Help: evictionsHelp, + ConstLabels: map[string]string{"reason": "max-bytes-reached"}, + }), + evictionsBecauseMaxItems: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: evictionsMetric, + Help: evictionsHelp, + ConstLabels: map[string]string{"reason": "max-items-reached"}, + }), + evictionsBecauseUnknown: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: evictionsMetric, + Help: evictionsHelp, + ConstLabels: map[string]string{"reason": "unknown"}, + }), } } diff --git a/vendor/modules.txt b/vendor/modules.txt index a5363da96a..dd1a96610a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1046,7 +1046,7 @@ github.com/prometheus/exporter-toolkit/web github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util -# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20250123075837-0cc2978b5013 +# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20250128175949-8d210000df3a ## explicit; go 1.22.7 github.com/prometheus/prometheus/config github.com/prometheus/prometheus/discovery @@ -1710,7 +1710,7 @@ sigs.k8s.io/kustomize/kyaml/yaml/walk sigs.k8s.io/yaml sigs.k8s.io/yaml/goyaml.v2 sigs.k8s.io/yaml/goyaml.v3 -# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20250123075837-0cc2978b5013 +# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20250128175949-8d210000df3a # github.com/hashicorp/memberlist => github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe # gopkg.in/yaml.v3 => github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 # github.com/grafana/regexp => github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b