Skip to content

Commit

Permalink
Fix: monitoring BPF maps
Browse files Browse the repository at this point in the history
Resolves all monitoring issues reported in cilium#1774.

Signed-off-by: sadath-12 <sadathsadu2002@gmail.com>
  • Loading branch information
sadath-12 committed Jan 9, 2024
1 parent 8802fd6 commit 3fa3d38
Show file tree
Hide file tree
Showing 9 changed files with 25 additions and 91 deletions.
4 changes: 0 additions & 4 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,3 @@ func New(s *server.Server) *Cache {
// Get returns the package-level cache value as-is; it never creates one.
// NOTE(review): presumably this is the instance installed by New, and may be
// nil if New has not run yet — confirm against the rest of the file.
func Get() *Cache {
return cache
}

// len reports the number of entries currently held in ec.cache.
func (ec *Cache) len() int {
return len(ec.cache)
}
30 changes: 0 additions & 30 deletions pkg/eventcache/metrics.go

This file was deleted.

13 changes: 13 additions & 0 deletions pkg/metrics/eventcachemetrics/eventcachemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

// ProcessCacheSize is the "type" label value passed to ProcessCache when an
// entry is added to the process cache. It is a constant so that callers cannot
// reassign the label value at runtime.
const ProcessCacheSize = "process_cache_size"

var (
processInfoErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Expand All @@ -33,6 +35,12 @@ var (
Help: "The total of errors encountered while fetching process exec information from the cache.",
ConstLabels: nil,
}, []string{"error"})
// ProcessCacheTotal counts entries added to the process cache, partitioned by
// the "type" label. It is a counter, not a gauge: it is incremented on every
// add and is never decremented, so it does not report the current cache size.
ProcessCacheTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
	Namespace:   consts.MetricsNamespace,
	Name:        "event_cache_process_total",
	Help:        "The total number of entries added to the process cache.",
	ConstLabels: nil,
}, []string{"type"})
)

func InitMetrics(registry *prometheus.Registry) {
Expand All @@ -56,3 +64,8 @@ func PodInfoError(eventType string) prometheus.Counter {
// EventCacheError returns the eventCacheErrorsTotal counter handle for the
// label value err.
func EventCacheError(err string) prometheus.Counter {
return eventCacheErrorsTotal.WithLabelValues(err)
}

// ProcessCache returns the ProcessCacheTotal counter handle whose "type" label
// is set to entryType.
func ProcessCache(entryType string) prometheus.Counter {
return ProcessCacheTotal.WithLabelValues(entryType)
}
17 changes: 3 additions & 14 deletions pkg/metrics/mapmetrics/mapmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,22 @@ import (
)

var (
MapDrops = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "map_drops_total",
Help: "The total number of entries dropped per LRU map.",
ConstLabels: nil,
}, []string{"map"})
MapSize = metrics.NewBPFGauge(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use_gauge"),
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use"),
"The total number of in-use entries per map.",
[]string{"map", "total"}, nil,
))
MapErrors = metrics.NewBPFCounter(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_errors_total"),
"The total number of entries dropped per LRU map.",
"The entries dropped per LRU map.",
[]string{"map"}, nil,
))
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(MapDrops)
func InitMetrics(_ *prometheus.Registry) {
// custom collectors are registered independently
}

// MapDropInc increments the MapDrops counter whose "map" label is mapName.
func MapDropInc(mapName string) {
MapDrops.WithLabelValues(mapName).Inc()
}

// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps.
// NB: We can't register individual BPF collectors collecting map metrics, because they share the
// metrics descriptors. Sending duplicate descriptors from different collectors results in
Expand Down
4 changes: 0 additions & 4 deletions pkg/metrics/metricsconfig/initmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
package metricsconfig

import (
"github.com/cilium/tetragon/pkg/eventcache"
"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
Expand All @@ -20,7 +19,6 @@ import (
"github.com/cilium/tetragon/pkg/metrics/syscallmetrics"
"github.com/cilium/tetragon/pkg/metrics/watchermetrics"
"github.com/cilium/tetragon/pkg/observer"
"github.com/cilium/tetragon/pkg/process"
"github.com/cilium/tetragon/pkg/version"
grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
"github.com/prometheus/client_golang/prometheus"
Expand All @@ -46,9 +44,7 @@ func InitAllMetrics(registry *prometheus.Registry) {

// register BPF collectors
registry.MustRegister(mapmetrics.NewBPFCollector(
eventcache.NewBPFCollector(),
observer.NewBPFCollector(),
process.NewBPFCollector(),
))
registry.MustRegister(eventmetrics.NewBPFCollector())

Expand Down
3 changes: 1 addition & 2 deletions pkg/observer/observer_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) {
for _, m := range sensors.AllMaps {
name := m.Name
pin := filepath.Join(option.Config.MapDir, name)
pinStats := pin + "_stats"

mapLinkStats, err := ebpf.LoadPinnedMap(pinStats, nil)
mapLinkStats, err := ebpf.LoadPinnedMap(pin, nil)
if err != nil {
return
}
Expand Down
3 changes: 1 addition & 2 deletions pkg/process/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/mapmetrics"
lru "github.com/hashicorp/golang-lru/v2"
)

Expand Down Expand Up @@ -132,7 +131,7 @@ func NewCache(
lruCache, err := lru.NewWithEvict(
processCacheSize,
func(_ string, _ *ProcessInternal) {
mapmetrics.MapDropInc("processLru")
errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted)
},
)
if err != nil {
Expand Down
31 changes: 0 additions & 31 deletions pkg/process/metrics.go

This file was deleted.

11 changes: 7 additions & 4 deletions pkg/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"sync/atomic"

"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
hubble "github.com/cilium/tetragon/pkg/oldhubble/cilium"
"github.com/sirupsen/logrus"

Expand Down Expand Up @@ -61,9 +62,7 @@ var (
k8s watcher.K8sResourceWatcher
)

var (
ErrProcessInfoMissing = errors.New("failed process info missing")
)
var ErrProcessInfoMissing = errors.New("failed process info missing")

func InitCache(w watcher.K8sResourceWatcher, size int) error {
var err error
Expand All @@ -88,6 +87,7 @@ func InitCache(w watcher.K8sResourceWatcher, size int) error {
// FreeCache purges the process cache, drops the package-level reference to it
// and resets the ProcessCacheTotal metric.
//
// It is safe to call when no cache is installed. FreeCache itself leaves
// procCache nil, so without the guard a repeated call (or a call made before
// InitCache) would invoke Purge on a nil cache.
func FreeCache() {
	if procCache != nil {
		procCache.Purge()
		procCache = nil
	}
	// Reset unconditionally so the metric never outlives the cache it describes.
	eventcachemetrics.ProcessCacheTotal.Reset()
}

// GetProcessCopy() duplicates tetragon.Process and returns it
Expand Down Expand Up @@ -355,7 +355,8 @@ func initProcessInternalExec(
// initProcessInternalClone() initialize and returns ProcessInternal from
// a clone event
func initProcessInternalClone(event *tetragonAPI.MsgCloneEvent,
parent *ProcessInternal, parentExecId string) (*ProcessInternal, error) {
parent *ProcessInternal, parentExecId string,
) (*ProcessInternal, error) {
pi := parent.cloneInternalProcessCopy()
if pi.process == nil {
err := fmt.Errorf("failed to clone parent process from cache")
Expand Down Expand Up @@ -440,6 +441,7 @@ func AddExecEvent(event *tetragonAPI.MsgExecveEventUnix) *ProcessInternal {
}

procCache.add(proc)
eventcachemetrics.ProcessCache(eventcachemetrics.ProcessCacheSize).Inc()
return proc
}

Expand All @@ -463,6 +465,7 @@ func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) error {

parent.RefInc()
procCache.add(proc)
eventcachemetrics.ProcessCache(eventcachemetrics.ProcessCacheSize).Inc()
return nil
}

Expand Down

0 comments on commit 3fa3d38

Please sign in to comment.