From 3fa3d38d8ae85953ca9d3981d6b4bc338e74ffc9 Mon Sep 17 00:00:00 2001 From: sadath-12 Date: Tue, 9 Jan 2024 17:29:01 +0530 Subject: [PATCH] Fix: monitoring BPF maps solving all monitoring issues exposed at issue #1774 Signed-off-by: sadath-12 --- pkg/eventcache/eventcache.go | 4 --- pkg/eventcache/metrics.go | 30 ------------------ .../eventcachemetrics/eventcachemetrics.go | 13 ++++++++ pkg/metrics/mapmetrics/mapmetrics.go | 17 ++-------- pkg/metrics/metricsconfig/initmetrics.go | 4 --- pkg/observer/observer_stats.go | 3 +- pkg/process/cache.go | 3 +- pkg/process/metrics.go | 31 ------------------- pkg/process/process.go | 11 ++++--- 9 files changed, 25 insertions(+), 91 deletions(-) delete mode 100644 pkg/eventcache/metrics.go delete mode 100644 pkg/process/metrics.go diff --git a/pkg/eventcache/eventcache.go b/pkg/eventcache/eventcache.go index 9755bf686d9..507ebabc612 100644 --- a/pkg/eventcache/eventcache.go +++ b/pkg/eventcache/eventcache.go @@ -246,7 +246,3 @@ func New(s *server.Server) *Cache { func Get() *Cache { return cache } - -func (ec *Cache) len() int { - return len(ec.cache) -} diff --git a/pkg/eventcache/metrics.go b/pkg/eventcache/metrics.go deleted file mode 100644 index 532674d7095..00000000000 --- a/pkg/eventcache/metrics.go +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright Authors of Tetragon - -package eventcache - -import ( - "github.com/cilium/tetragon/pkg/metrics/mapmetrics" - "github.com/prometheus/client_golang/prometheus" -) - -// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps. 
-type bpfCollector struct{} - -func NewBPFCollector() prometheus.Collector { - return &bpfCollector{} -} - -func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- mapmetrics.MapSize.Desc() -} - -func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) { - ec := Get() - if ec != nil { - ch <- mapmetrics.MapSize.MustMetric( - float64(ec.len()), - "eventcache", "0", - ) - } -} diff --git a/pkg/metrics/eventcachemetrics/eventcachemetrics.go b/pkg/metrics/eventcachemetrics/eventcachemetrics.go index 9fa660b5a24..2bb6436f0fa 100644 --- a/pkg/metrics/eventcachemetrics/eventcachemetrics.go +++ b/pkg/metrics/eventcachemetrics/eventcachemetrics.go @@ -8,6 +8,8 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +var ProcessCacheSize = "process_cache_size" + var ( processInfoErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, @@ -33,6 +35,12 @@ var ( Help: "The total of errors encountered while fetching process exec information from the cache.", ConstLabels: nil, }, []string{"error"}) + ProcessCacheTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: consts.MetricsNamespace, + Name: "event_cache_process_total", + Help: "The size of the process cache.", + ConstLabels: nil, + }, []string{"type"}) ) func InitMetrics(registry *prometheus.Registry) { @@ -56,3 +64,8 @@ func PodInfoError(eventType string) prometheus.Counter { func EventCacheError(err string) prometheus.Counter { return eventCacheErrorsTotal.WithLabelValues(err) } + +// Get a new handle on a ProcessCache metric for an entryType +func ProcessCache(entryType string) prometheus.Counter { + return ProcessCacheTotal.WithLabelValues(entryType) +} diff --git a/pkg/metrics/mapmetrics/mapmetrics.go b/pkg/metrics/mapmetrics/mapmetrics.go index b3a9066b20d..2be73887c20 100644 --- a/pkg/metrics/mapmetrics/mapmetrics.go +++ b/pkg/metrics/mapmetrics/mapmetrics.go @@ -10,33 +10,22 @@ import ( ) var ( - MapDrops = 
prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: consts.MetricsNamespace, - Name: "map_drops_total", - Help: "The total number of entries dropped per LRU map.", - ConstLabels: nil, - }, []string{"map"}) MapSize = metrics.NewBPFGauge(prometheus.NewDesc( - prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use_gauge"), + prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use"), "The total number of in-use entries per map.", []string{"map", "total"}, nil, )) MapErrors = metrics.NewBPFCounter(prometheus.NewDesc( prometheus.BuildFQName(consts.MetricsNamespace, "", "map_errors_total"), - "The total number of entries dropped per LRU map.", + "The entries dropped per LRU map.", []string{"map"}, nil, )) ) -func InitMetrics(registry *prometheus.Registry) { - registry.MustRegister(MapDrops) +func InitMetrics(_ *prometheus.Registry) { // custom collectors are registered independently } -func MapDropInc(mapName string) { - MapDrops.WithLabelValues(mapName).Inc() -} - // bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps. // NB: We can't register individual BPF collectors collecting map metrics, because they share the // metrics descriptors. 
Sending duplicate descriptors from different collectors results in diff --git a/pkg/metrics/metricsconfig/initmetrics.go b/pkg/metrics/metricsconfig/initmetrics.go index 4109597f899..000d19da0d6 100644 --- a/pkg/metrics/metricsconfig/initmetrics.go +++ b/pkg/metrics/metricsconfig/initmetrics.go @@ -4,7 +4,6 @@ package metricsconfig import ( - "github.com/cilium/tetragon/pkg/eventcache" "github.com/cilium/tetragon/pkg/grpc/tracing" "github.com/cilium/tetragon/pkg/metrics/errormetrics" "github.com/cilium/tetragon/pkg/metrics/eventcachemetrics" @@ -20,7 +19,6 @@ import ( "github.com/cilium/tetragon/pkg/metrics/syscallmetrics" "github.com/cilium/tetragon/pkg/metrics/watchermetrics" "github.com/cilium/tetragon/pkg/observer" - "github.com/cilium/tetragon/pkg/process" "github.com/cilium/tetragon/pkg/version" grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" "github.com/prometheus/client_golang/prometheus" @@ -46,9 +44,7 @@ func InitAllMetrics(registry *prometheus.Registry) { // register BPF collectors registry.MustRegister(mapmetrics.NewBPFCollector( - eventcache.NewBPFCollector(), observer.NewBPFCollector(), - process.NewBPFCollector(), )) registry.MustRegister(eventmetrics.NewBPFCollector()) diff --git a/pkg/observer/observer_stats.go b/pkg/observer/observer_stats.go index bdf51c772ed..fa41579ff39 100644 --- a/pkg/observer/observer_stats.go +++ b/pkg/observer/observer_stats.go @@ -30,9 +30,8 @@ func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) { for _, m := range sensors.AllMaps { name := m.Name pin := filepath.Join(option.Config.MapDir, name) - pinStats := pin + "_stats" - mapLinkStats, err := ebpf.LoadPinnedMap(pinStats, nil) + mapLinkStats, err := ebpf.LoadPinnedMap(pin, nil) if err != nil { return } diff --git a/pkg/process/cache.go b/pkg/process/cache.go index 190a052a3ff..081da9444e3 100644 --- a/pkg/process/cache.go +++ b/pkg/process/cache.go @@ -11,7 +11,6 @@ import ( "github.com/cilium/tetragon/api/v1/tetragon" 
"github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/metrics/errormetrics" - "github.com/cilium/tetragon/pkg/metrics/mapmetrics" lru "github.com/hashicorp/golang-lru/v2" ) @@ -132,7 +131,7 @@ func NewCache( lruCache, err := lru.NewWithEvict( processCacheSize, func(_ string, _ *ProcessInternal) { - mapmetrics.MapDropInc("processLru") + errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted) }, ) if err != nil { diff --git a/pkg/process/metrics.go b/pkg/process/metrics.go deleted file mode 100644 index 5d37cb34704..00000000000 --- a/pkg/process/metrics.go +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright Authors of Tetragon - -package process - -import ( - "fmt" - - "github.com/cilium/tetragon/pkg/metrics/mapmetrics" - "github.com/prometheus/client_golang/prometheus" -) - -// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps. -type bpfCollector struct{} - -func NewBPFCollector() prometheus.Collector { - return &bpfCollector{} -} - -func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- mapmetrics.MapSize.Desc() -} - -func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) { - if procCache != nil { - ch <- mapmetrics.MapSize.MustMetric( - float64(procCache.len()), - "processLru", fmt.Sprint(procCache.size), - ) - } -} diff --git a/pkg/process/process.go b/pkg/process/process.go index db9e82a9bbf..7af3e8caf0f 100644 --- a/pkg/process/process.go +++ b/pkg/process/process.go @@ -12,6 +12,7 @@ import ( "sync/atomic" "github.com/cilium/tetragon/pkg/metrics/errormetrics" + "github.com/cilium/tetragon/pkg/metrics/eventcachemetrics" hubble "github.com/cilium/tetragon/pkg/oldhubble/cilium" "github.com/sirupsen/logrus" @@ -61,9 +62,7 @@ var ( k8s watcher.K8sResourceWatcher ) -var ( - ErrProcessInfoMissing = errors.New("failed process info missing") -) +var ErrProcessInfoMissing = errors.New("failed process info missing") func InitCache(w 
watcher.K8sResourceWatcher, size int) error { var err error @@ -88,6 +87,7 @@ func InitCache(w watcher.K8sResourceWatcher, size int) error { func FreeCache() { procCache.Purge() procCache = nil + eventcachemetrics.ProcessCacheTotal.Reset() } // GetProcessCopy() duplicates tetragon.Process and returns it @@ -355,7 +355,8 @@ func initProcessInternalExec( // initProcessInternalClone() initialize and returns ProcessInternal from // a clone event func initProcessInternalClone(event *tetragonAPI.MsgCloneEvent, - parent *ProcessInternal, parentExecId string) (*ProcessInternal, error) { + parent *ProcessInternal, parentExecId string, +) (*ProcessInternal, error) { pi := parent.cloneInternalProcessCopy() if pi.process == nil { err := fmt.Errorf("failed to clone parent process from cache") @@ -440,6 +441,7 @@ func AddExecEvent(event *tetragonAPI.MsgExecveEventUnix) *ProcessInternal { } procCache.add(proc) + eventcachemetrics.ProcessCache(eventcachemetrics.ProcessCacheSize).Inc() return proc } @@ -463,6 +465,7 @@ func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) error { parent.RefInc() procCache.add(proc) + eventcachemetrics.ProcessCache(eventcachemetrics.ProcessCacheSize).Inc() return nil }