Skip to content

Commit

Permalink
metrics: Delete metrics for deleted pods
Browse files Browse the repository at this point in the history
Some of the exposed metrics have a "pod" label, which contains the name of the
monitored pod. Until now, when a pod was deleted, Tetragon kept exposing stale
metrics for it. This caused a continuous increase in memory usage in the
Tetragon agent as well as in the metrics scraper.

This commit fixes the issue. Now, if metrics and the k8s API are both enabled,
an additional pod hook is registered that deletes the metrics associated with
a pod when that pod is deleted.

Signed-off-by: Anna Kapuscinska <anna@isovalent.com>
  • Loading branch information
lambdanis committed Aug 30, 2023
1 parent 9e168ce commit 16a9408
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cmd/tetragon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,8 @@ func tetragonExecute() error {

if option.Config.MetricsServer != "" {
go metrics.EnableMetrics(option.Config.MetricsServer)
// Handler must be registered before the watcher is started
metrics.RegisterPodDeleteHandler()
}

// Probe runtime configuration and do not fail on errors
Expand Down
8 changes: 8 additions & 0 deletions pkg/metrics/eventmetrics/eventmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(policyStats)
}

// ListMetricsWithPod reports the metric vectors of this package that carry
// "pod" and "namespace" labels: the processed-events counter and the policy
// statistics counter.
func ListMetricsWithPod() []*prometheus.MetricVec {
	vecs := make([]*prometheus.MetricVec, 0, 2)
	vecs = append(vecs, EventsProcessed.MetricVec, policyStats.MetricVec)
	return vecs
}

func GetProcessInfo(process *tetragon.Process) (binary, pod, namespace string) {
if process != nil {
binary = process.Binary
Expand Down
62 changes: 62 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package metrics

import (
"net/http"
"sync"

"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/logger"
Expand All @@ -20,10 +21,71 @@ import (
"github.com/cilium/tetragon/pkg/metrics/syscallmetrics"
"github.com/cilium/tetragon/pkg/metrics/watchermetrics"
"github.com/cilium/tetragon/pkg/observer"
"github.com/cilium/tetragon/pkg/podhooks"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/cache"
)

var (
// metricsWithPod holds the metric vectors collected by ListMetricsWithPod.
// It is filled in on the first call to ListMetricsWithPod.
metricsWithPod []*prometheus.MetricVec
// once ensures metricsWithPod is populated a single time.
once sync.Once
)

// RegisterPodDeleteHandler installs a pod hook that removes the metrics
// associated with a pod once that pod is deleted. Without this hook, stale
// series for deleted pods would be exposed indefinitely, steadily increasing
// memory usage both in the Tetragon agent and in the metrics scraper.
//
// It must be called before the pod watcher is started.
func RegisterPodDeleteHandler() {
	logger.GetLogger().Info("Registering pod delete handler for metrics")

	// onPodDelete resolves the deleted object to a pod and drops its metrics.
	onPodDelete := func(obj interface{}) {
		// The watcher may have missed the deletion event (e.g. due to a lost
		// apiserver connection); in that case the last known object arrives
		// wrapped in a tombstone and has to be unwrapped first.
		if tombstone, isTombstone := obj.(cache.DeletedFinalStateUnknown); isTombstone {
			obj = tombstone.Obj
		}
		pod, isPod := obj.(*corev1.Pod)
		if !isPod {
			// Not a pod (or a tombstone holding something else): nothing to do.
			return
		}
		DeleteMetricsForPod(pod)
	}

	// attachToInformer wires the delete callback into the pod informer.
	attachToInformer := func(podInformer cache.SharedIndexInformer) {
		podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
			DeleteFunc: onPodDelete,
		})
	}

	podhooks.RegisterCallbacksAtInit(podhooks.Callbacks{
		PodCallbacks: attachToInformer,
	})
}

// ListMetricsWithPod returns the package-wide list of metric vectors that
// have "pod" and "namespace" labels. The list is assembled from the
// eventmetrics and syscallmetrics packages on the first call and the same
// slice is returned on every subsequent call.
func ListMetricsWithPod() []*prometheus.MetricVec {
	once.Do(func() {
		providers := []func() []*prometheus.MetricVec{
			eventmetrics.ListMetricsWithPod,
			syscallmetrics.ListMetricsWithPod,
		}
		for _, listVecs := range providers {
			metricsWithPod = append(metricsWithPod, listVecs()...)
		}
	})
	return metricsWithPod
}

// DeleteMetricsForPod removes, from every metric vector returned by
// ListMetricsWithPod, all series whose "pod" and "namespace" labels match the
// name and namespace of the given pod.
//
// A nil pod is ignored, so that an unexpected nil object delivered to the pod
// delete handler cannot panic the agent.
func DeleteMetricsForPod(pod *corev1.Pod) {
	if pod == nil {
		return
	}

	// The label set is the same for every metric, so build it once.
	podLabels := prometheus.Labels{
		"pod":       pod.Name,
		"namespace": pod.Namespace,
	}
	for _, metric := range ListMetricsWithPod() {
		metric.DeletePartialMatch(podLabels)
	}
}

func InitAllMetrics(registry *prometheus.Registry) {
errormetrics.InitMetrics(registry)
eventcachemetrics.InitMetrics(registry)
Expand Down
86 changes: 86 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package metrics

import (
"testing"

"github.com/prometheus/client_golang/prometheus"
io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/metrics/eventmetrics"
)

// sampleMsgGenericTracepointUnix is the tracepoint message handed to
// eventmetrics.ProcessEvent by TestPodDelete; only its policy name is set.
var sampleMsgGenericTracepointUnix = tracing.MsgGenericTracepointUnix{
PolicyName: "fake-policy",
}

// TestPodDelete checks that DeleteMetricsForPod removes exactly the series
// belonging to the deleted pod and leaves series of other pods untouched.
func TestPodDelete(t *testing.T) {
	registry := prometheus.NewRegistry()
	InitAllMetrics(registry)

	// newTracepointEvent builds a raw_syscalls/sys_enter tracepoint event
	// attributed to the pod with the given namespace and name.
	newTracepointEvent := func(ns, name string) *tetragon.GetEventsResponse {
		syscallArg := &tetragon.KprobeArgument{
			Arg: &tetragon.KprobeArgument_LongArg{LongArg: 0},
		}
		tracepoint := &tetragon.ProcessTracepoint{
			Subsys: "raw_syscalls",
			Event:  "sys_enter",
			Process: &tetragon.Process{
				Pod: &tetragon.Pod{Namespace: ns, Name: name},
			},
			Args: []*tetragon.KprobeArgument{syscallArg},
		}
		return &tetragon.GetEventsResponse{
			Event: &tetragon.GetEventsResponse_ProcessTracepoint{
				ProcessTracepoint: tracepoint,
			},
		}
	}

	// Feed in four events, one per pod/namespace combination. Each event is
	// expected to be counted by several metrics carrying a "pod" label:
	//   * tetragon_events_total
	//   * tetragon_policy_events_total
	//   * tetragon_syscalls_total
	namespaces := []string{"fake-namespace", "other-namespace"}
	podNames := []string{"fake-pod", "other-pod"}
	for _, ns := range namespaces {
		for _, name := range podNames {
			eventmetrics.ProcessEvent(&sampleMsgGenericTracepointUnix, newTracepointEvent(ns, name))
		}
	}
	checkMetricSeriesCount(t, registry, 4)

	// Deleting one pod must drop exactly one series per metric: the one
	// matching both the pod name and the namespace.
	deletedPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "fake-pod",
			Namespace: "fake-namespace",
		},
	}
	DeleteMetricsForPod(deletedPod)
	checkMetricSeriesCount(t, registry, 3)
}

// checkMetricSeriesCount gathers all metrics from registry and asserts that
// each of the pod-labelled metrics (tetragon_events_total,
// tetragon_policy_events_total and tetragon_syscalls_total) is present and
// has exactly seriesCount series.
//
// The test is aborted if gathering fails or one of the metrics is missing; a
// wrong series count is reported without stopping the test.
func checkMetricSeriesCount(t *testing.T, registry *prometheus.Registry, seriesCount int) {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	metricFamilies, err := registry.Gather()
	require.NoError(t, err)

	metricNameToSeries := make(map[string]*io_prometheus_client.MetricFamily, len(metricFamilies))
	for _, metricFamily := range metricFamilies {
		// GetName is nil-safe, unlike dereferencing the Name pointer directly.
		metricNameToSeries[metricFamily.GetName()] = metricFamily
	}
	for _, metric := range []string{"tetragon_events_total", "tetragon_policy_events_total", "tetragon_syscalls_total"} {
		metricFamily := metricNameToSeries[metric]
		require.NotNil(t, metricFamily, "metric %s was not gathered", metric)
		assert.Len(t, metricFamily.Metric, seriesCount, "unexpected number of series for metric %s", metric)
	}
}
7 changes: 7 additions & 0 deletions pkg/metrics/syscallmetrics/syscallmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(syscallStats)
}

// ListMetricsWithPod reports the metric vectors of this package that carry
// "pod" and "namespace" labels; currently that is only the syscall
// statistics counter.
func ListMetricsWithPod() []*prometheus.MetricVec {
	vecs := []*prometheus.MetricVec{syscallStats.MetricVec}
	return vecs
}

func Handle(event interface{}) {
ev, ok := event.(*tetragon.GetEventsResponse)
if !ok {
Expand Down

0 comments on commit 16a9408

Please sign in to comment.