From 326ac7d373263719bc9384a54fa924a8773b74c7 Mon Sep 17 00:00:00 2001 From: sadath-12 Date: Mon, 19 Feb 2024 12:39:30 +0530 Subject: [PATCH 1/5] mapmetrics: Improve help text for tetragon_map_errors_total Signed-off-by: sadath-12 --- pkg/metrics/mapmetrics/mapmetrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metrics/mapmetrics/mapmetrics.go b/pkg/metrics/mapmetrics/mapmetrics.go index b3a9066b20d..fe9da27bc5c 100644 --- a/pkg/metrics/mapmetrics/mapmetrics.go +++ b/pkg/metrics/mapmetrics/mapmetrics.go @@ -23,7 +23,7 @@ var ( )) MapErrors = metrics.NewBPFCounter(prometheus.NewDesc( prometheus.BuildFQName(consts.MetricsNamespace, "", "map_errors_total"), - "The total number of entries dropped per LRU map.", + "The number of errors per map.", []string{"map"}, nil, )) ) From 96a78f89a7970a4a1fb39df2d8e97dd9fde51411 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Fri, 1 Mar 2024 11:32:19 +0000 Subject: [PATCH 2/5] mapmetrics: Remove tetragon_map_drops_total This metric was counting process cache evictions, so effectively duplicating tetragon_errors_total{type="process_cache_evicted"}. Additionally, having it in the mapmetrics package was confusing, as it's monitoring a userspace process cache, not BPF maps. Let's remove it. Co-authored-by: sadath-12 Signed-off-by: sadath-12 Signed-off-by: Anna Kapuscinska --- pkg/metrics/mapmetrics/mapmetrics.go | 13 +------------ pkg/process/cache.go | 6 +----- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/pkg/metrics/mapmetrics/mapmetrics.go b/pkg/metrics/mapmetrics/mapmetrics.go index fe9da27bc5c..0aeb3bc2e83 100644 --- a/pkg/metrics/mapmetrics/mapmetrics.go +++ b/pkg/metrics/mapmetrics/mapmetrics.go @@ -10,12 +10,6 @@ import ( ) var ( - MapDrops = prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: consts.MetricsNamespace, - Name: "map_drops_total", - Help: "The total number of entries dropped per LRU map.", - ConstLabels: nil, - }, []string{"map"}) MapSize = metrics.NewBPFGauge(prometheus.NewDesc( prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use_gauge"), "The total number of in-use entries per map.", @@ -28,15 +22,10 @@ var ( )) ) -func InitMetrics(registry *prometheus.Registry) { - registry.MustRegister(MapDrops) +func InitMetrics(_ *prometheus.Registry) { // custom collectors are registered independently } -func MapDropInc(mapName string) { - MapDrops.WithLabelValues(mapName).Inc() -} - // bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps. // NB: We can't register individual BPF collectors collecting map metrics, because they share the // metrics descriptors. Sending duplicate descriptors from different collectors results in diff --git a/pkg/process/cache.go b/pkg/process/cache.go index 190a052a3ff..38a16f6c347 100644 --- a/pkg/process/cache.go +++ b/pkg/process/cache.go @@ -11,7 +11,6 @@ import ( "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/metrics/errormetrics" - "github.com/cilium/tetragon/pkg/metrics/mapmetrics" lru "github.com/hashicorp/golang-lru/v2" ) @@ -132,7 +131,7 @@ func NewCache( lruCache, err := lru.NewWithEvict( processCacheSize, func(_ string, _ *ProcessInternal) { - mapmetrics.MapDropInc("processLru") + errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted) }, ) if err != nil { @@ -160,9 +159,6 @@ func (pc *Cache) get(processID string) (*ProcessInternal, error) { // clone or execve events func (pc *Cache) add(process *ProcessInternal) bool { evicted := pc.cache.Add(process.process.ExecId, process) - if evicted { - errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted) - } return evicted } From 1215fbc1c83e348c6459374e8d066134d6c08f35 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Fri, 1 Mar 2024 11:33:46 +0000 Subject: [PATCH 3/5] mapmetrics: Remove tetragon_map_in_use_gauge{map="eventcache"} Using the map size metric for reporting the event cache size was confusing, as event cache is not a BPF map. Let's remove it. A metric reporting the event cache size will be added in a follow-up commit. Co-authored-by: sadath-12 Signed-off-by: sadath-12 Signed-off-by: Anna Kapuscinska --- pkg/eventcache/eventcache.go | 4 ---- pkg/eventcache/metrics.go | 30 ------------------------ pkg/metrics/metricsconfig/initmetrics.go | 2 -- 3 files changed, 36 deletions(-) delete mode 100644 pkg/eventcache/metrics.go diff --git a/pkg/eventcache/eventcache.go b/pkg/eventcache/eventcache.go index 5f5a03649ce..b5488609c44 100644 --- a/pkg/eventcache/eventcache.go +++ b/pkg/eventcache/eventcache.go @@ -247,7 +247,3 @@ func New(s *server.Server) *Cache { func Get() *Cache { return cache } - -func (ec *Cache) len() int { - return len(ec.cache) -} diff --git a/pkg/eventcache/metrics.go b/pkg/eventcache/metrics.go deleted file mode 100644 index 532674d7095..00000000000 --- a/pkg/eventcache/metrics.go +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright Authors of Tetragon - -package eventcache - -import ( - "github.com/cilium/tetragon/pkg/metrics/mapmetrics" - "github.com/prometheus/client_golang/prometheus" -) - -// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps. -type bpfCollector struct{} - -func NewBPFCollector() prometheus.Collector { - return &bpfCollector{} -} - -func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- mapmetrics.MapSize.Desc() -} - -func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) { - ec := Get() - if ec != nil { - ch <- mapmetrics.MapSize.MustMetric( - float64(ec.len()), - "eventcache", "0", - ) - } -} diff --git a/pkg/metrics/metricsconfig/initmetrics.go b/pkg/metrics/metricsconfig/initmetrics.go index 4109597f899..b66cb8560bc 100644 --- a/pkg/metrics/metricsconfig/initmetrics.go +++ b/pkg/metrics/metricsconfig/initmetrics.go @@ -4,7 +4,6 @@ package metricsconfig import ( - "github.com/cilium/tetragon/pkg/eventcache" "github.com/cilium/tetragon/pkg/grpc/tracing" "github.com/cilium/tetragon/pkg/metrics/errormetrics" "github.com/cilium/tetragon/pkg/metrics/eventcachemetrics" @@ -46,7 +45,6 @@ func InitAllMetrics(registry *prometheus.Registry) { // register BPF collectors registry.MustRegister(mapmetrics.NewBPFCollector( - eventcache.NewBPFCollector(), observer.NewBPFCollector(), process.NewBPFCollector(), )) From a715405b6e809d3b1399334e414cb7e5b4843909 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Fri, 1 Mar 2024 11:47:29 +0000 Subject: [PATCH 4/5] eventcachemetrics: Fix function doc comments Some of them were incorrect. Co-authored-by: sadath-12 Signed-off-by: sadath-12 Signed-off-by: Anna Kapuscinska --- pkg/metrics/eventcachemetrics/eventcachemetrics.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/metrics/eventcachemetrics/eventcachemetrics.go b/pkg/metrics/eventcachemetrics/eventcachemetrics.go index 669fd61ba81..67400cb3a44 100644 --- a/pkg/metrics/eventcachemetrics/eventcachemetrics.go +++ b/pkg/metrics/eventcachemetrics/eventcachemetrics.go @@ -61,22 +61,22 @@ func InitMetrics(registry *prometheus.Registry) { registry.MustRegister(parentInfoErrors) } -// Get a new handle on an processInfoErrors metric for an eventType +// Get a new handle on a processInfoErrors metric for an eventType func ProcessInfoError(eventType string) prometheus.Counter { return processInfoErrors.WithLabelValues(eventType) } -// Get a new handle on an processInfoErrors metric for an eventType +// Get a new handle on a podInfoErrors metric for an eventType func PodInfoError(eventType string) prometheus.Counter { return podInfoErrors.WithLabelValues(eventType) } -// Get a new handle on an processInfoErrors metric for an eventType +// Get a new handle on an eventCacheErrorsTotal metric for an error func EventCacheError(err string) prometheus.Counter { return eventCacheErrorsTotal.WithLabelValues(err) } -// Get a new handle on the eventCacheRetriesTotal metric for an entryType +// Get a new handle on an eventCacheRetriesTotal metric for an entryType func EventCacheRetries(entryType string) prometheus.Counter { return eventCacheRetriesTotal.WithLabelValues(entryType) } From bcf3ee4d65071a9a554b8436b5cc8e8214b16851 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Fri, 1 Mar 2024 11:48:22 +0000 Subject: [PATCH 5/5] metrics: Replace tetragon_map_in_use_gauge{map="processLru"} Replace tetragon_map_in_use_gauge{map="processLru"} with tetragon_process_cache_size. It's reporting the process cache size. Using the map size metric for this purpose was confusing, as process cache is not a BPF map. Co-authored-by: sadath-12 Signed-off-by: sadath-12 Signed-off-by: Anna Kapuscinska --- pkg/metrics/metricsconfig/initmetrics.go | 2 -- pkg/process/metrics.go | 29 ++++++++---------------- pkg/process/process.go | 3 +++ 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/pkg/metrics/metricsconfig/initmetrics.go b/pkg/metrics/metricsconfig/initmetrics.go index b66cb8560bc..000d19da0d6 100644 --- a/pkg/metrics/metricsconfig/initmetrics.go +++ b/pkg/metrics/metricsconfig/initmetrics.go @@ -19,7 +19,6 @@ import ( "github.com/cilium/tetragon/pkg/metrics/syscallmetrics" "github.com/cilium/tetragon/pkg/metrics/watchermetrics" "github.com/cilium/tetragon/pkg/observer" - "github.com/cilium/tetragon/pkg/process" "github.com/cilium/tetragon/pkg/version" grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" "github.com/prometheus/client_golang/prometheus" @@ -46,7 +45,6 @@ func InitAllMetrics(registry *prometheus.Registry) { // register BPF collectors registry.MustRegister(mapmetrics.NewBPFCollector( observer.NewBPFCollector(), - process.NewBPFCollector(), )) registry.MustRegister(eventmetrics.NewBPFCollector()) diff --git a/pkg/process/metrics.go b/pkg/process/metrics.go index 5d37cb34704..9dc24d4df42 100644 --- a/pkg/process/metrics.go +++ b/pkg/process/metrics.go @@ -4,28 +4,17 @@ package process import ( - "fmt" - - "github.com/cilium/tetragon/pkg/metrics/mapmetrics" + "github.com/cilium/tetragon/pkg/metrics/consts" "github.com/prometheus/client_golang/prometheus" ) -// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps. -type bpfCollector struct{} - -func NewBPFCollector() prometheus.Collector { - return &bpfCollector{} -} - -func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- mapmetrics.MapSize.Desc() -} +var ProcessCacheTotal = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: consts.MetricsNamespace, + Name: "process_cache_size", + Help: "The size of the process cache", + ConstLabels: nil, +}) -func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) { - if procCache != nil { - ch <- mapmetrics.MapSize.MustMetric( - float64(procCache.len()), - "processLru", fmt.Sprint(procCache.size), - ) - } +func InitMetrics(registry *prometheus.Registry) { + registry.MustRegister(ProcessCacheTotal) } diff --git a/pkg/process/process.go b/pkg/process/process.go index 57e60f44b49..0bd50ce5670 100644 --- a/pkg/process/process.go +++ b/pkg/process/process.go @@ -88,6 +88,7 @@ func InitCache(w watcher.K8sResourceWatcher, size int) error { func FreeCache() { procCache.Purge() procCache = nil + ProcessCacheTotal.Set(0) } // GetProcessCopy() duplicates tetragon.Process and returns it @@ -462,6 +463,7 @@ func AddExecEvent(event *tetragonAPI.MsgExecveEventUnix) *ProcessInternal { } procCache.add(proc) + ProcessCacheTotal.Inc() return proc } @@ -485,6 +487,7 @@ func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) error { parent.RefInc() procCache.add(proc) + ProcessCacheTotal.Inc() return nil }