Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: Improve monitoring BPF maps and userspace caches #1950

Merged
merged 5 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,3 @@ func New(s *server.Server) *Cache {
// Get returns the package-level cache variable.
func Get() *Cache {
return cache
}

// len returns the number of entries currently held in ec.cache.
func (ec *Cache) len() int {
return len(ec.cache)
}
30 changes: 0 additions & 30 deletions pkg/eventcache/metrics.go

This file was deleted.

8 changes: 4 additions & 4 deletions pkg/metrics/eventcachemetrics/eventcachemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,22 +61,22 @@ func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(parentInfoErrors)
}

// ProcessInfoError returns a handle on the processInfoErrors counter labeled
// with the given eventType.
func ProcessInfoError(eventType string) prometheus.Counter {
return processInfoErrors.WithLabelValues(eventType)
}

// PodInfoError returns a handle on the podInfoErrors counter labeled with the
// given eventType.
func PodInfoError(eventType string) prometheus.Counter {
return podInfoErrors.WithLabelValues(eventType)
}

// EventCacheError returns a handle on the eventCacheErrorsTotal counter
// labeled with the given error string.
func EventCacheError(err string) prometheus.Counter {
return eventCacheErrorsTotal.WithLabelValues(err)
}

// EventCacheRetries returns a handle on the eventCacheRetriesTotal counter
// labeled with the given entryType.
func EventCacheRetries(entryType string) prometheus.Counter {
return eventCacheRetriesTotal.WithLabelValues(entryType)
}
Expand Down
15 changes: 2 additions & 13 deletions pkg/metrics/mapmetrics/mapmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,22 @@ import (
)

var (
MapDrops = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "map_drops_total",
Help: "The total number of entries dropped per LRU map.",
ConstLabels: nil,
}, []string{"map"})
MapSize = metrics.NewBPFGauge(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use_gauge"),
"The total number of in-use entries per map.",
[]string{"map", "total"}, nil,
))
MapErrors = metrics.NewBPFCounter(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_errors_total"),
"The total number of entries dropped per LRU map.",
"The number of errors per map.",
[]string{"map"}, nil,
))
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(MapDrops)
func InitMetrics(_ *prometheus.Registry) {
// custom collectors are registered independently
}

// MapDropInc increments the MapDrops counter for the map named mapName.
func MapDropInc(mapName string) {
MapDrops.WithLabelValues(mapName).Inc()
}

// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps.
// NB: We can't register individual BPF collectors collecting map metrics, because they share the
// metrics descriptors. Sending duplicate descriptors from different collectors results in
Expand Down
4 changes: 0 additions & 4 deletions pkg/metrics/metricsconfig/initmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
package metricsconfig

import (
"github.com/cilium/tetragon/pkg/eventcache"
"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
Expand All @@ -20,7 +19,6 @@ import (
"github.com/cilium/tetragon/pkg/metrics/syscallmetrics"
"github.com/cilium/tetragon/pkg/metrics/watchermetrics"
"github.com/cilium/tetragon/pkg/observer"
"github.com/cilium/tetragon/pkg/process"
"github.com/cilium/tetragon/pkg/version"
grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
"github.com/prometheus/client_golang/prometheus"
Expand All @@ -46,9 +44,7 @@ func InitAllMetrics(registry *prometheus.Registry) {

// register BPF collectors
registry.MustRegister(mapmetrics.NewBPFCollector(
eventcache.NewBPFCollector(),
observer.NewBPFCollector(),
process.NewBPFCollector(),
))
registry.MustRegister(eventmetrics.NewBPFCollector())

Expand Down
6 changes: 1 addition & 5 deletions pkg/process/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/mapmetrics"
lru "github.com/hashicorp/golang-lru/v2"
)

Expand Down Expand Up @@ -132,7 +131,7 @@ func NewCache(
lruCache, err := lru.NewWithEvict(
processCacheSize,
func(_ string, _ *ProcessInternal) {
mapmetrics.MapDropInc("processLru")
errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted)
sadath-12 marked this conversation as resolved.
Show resolved Hide resolved
},
)
if err != nil {
Expand Down Expand Up @@ -160,9 +159,6 @@ func (pc *Cache) get(processID string) (*ProcessInternal, error) {
// clone or execve events
func (pc *Cache) add(process *ProcessInternal) bool {
evicted := pc.cache.Add(process.process.ExecId, process)
if evicted {
errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted)
}
return evicted
}

Expand Down
29 changes: 9 additions & 20 deletions pkg/process/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,17 @@
package process

import (
"fmt"

"github.com/cilium/tetragon/pkg/metrics/mapmetrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)

// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps.
// NOTE(review): the Collect method shown for this type reads procCache.len()
// and procCache.size rather than a BPF map — confirm the wording above.
type bpfCollector struct{}

// NewBPFCollector returns a new bpfCollector as a prometheus.Collector.
func NewBPFCollector() prometheus.Collector {
return &bpfCollector{}
}

// Describe sends the mapmetrics.MapSize descriptor on ch.
func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- mapmetrics.MapSize.Desc()
}
// ProcessCacheTotal is a gauge named "process_cache_size" under
// consts.MetricsNamespace.
// NOTE(review): the visible call sites only call Inc (AddExecEvent,
// AddCloneEvent) and Set(0) (FreeCache). Confirm that evictions and removals
// decrement it somewhere, otherwise the gauge will overstate the cache size.
var ProcessCacheTotal = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: consts.MetricsNamespace,
Name: "process_cache_size",
Help: "The size of the process cache",
ConstLabels: nil,
})

func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) {
if procCache != nil {
ch <- mapmetrics.MapSize.MustMetric(
float64(procCache.len()),
"processLru", fmt.Sprint(procCache.size),
)
}
// InitMetrics registers ProcessCacheTotal with the given registry.
// MustRegister panics if the registration fails.
func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(ProcessCacheTotal)
}
3 changes: 3 additions & 0 deletions pkg/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ func InitCache(w watcher.K8sResourceWatcher, size int) error {
// FreeCache purges procCache, sets the package-level procCache to nil, and
// resets the ProcessCacheTotal gauge to 0.
func FreeCache() {
procCache.Purge()
procCache = nil
// The cache no longer exists, so the size gauge is reset as well.
ProcessCacheTotal.Set(0)
}

// GetProcessCopy() duplicates tetragon.Process and returns it
Expand Down Expand Up @@ -462,6 +463,7 @@ func AddExecEvent(event *tetragonAPI.MsgExecveEventUnix) *ProcessInternal {
}

procCache.add(proc)
ProcessCacheTotal.Inc()
return proc
}

Expand All @@ -485,6 +487,7 @@ func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) error {

parent.RefInc()
procCache.add(proc)
ProcessCacheTotal.Inc()
return nil
}

Expand Down
Loading