From 7bf5a690b2d18f93599992ea805c7954d08663a9 Mon Sep 17 00:00:00 2001 From: Nick Peluso <10912027+nap32@users.noreply.github.com> Date: Fri, 8 Sep 2023 11:32:12 -0700 Subject: [PATCH] metrics: Add metrics label filter configuration Currently, metric labels are all-or-nothing. Certain labels may cause cardinality issues. This patch introduces a new configuration option - MetricsLabelFilter. It is an allow-list covering the namespace, workload, pod, and binary labels. Metrics that support these labels will only include the ones listed in the filter. Fixes: #1037 Signed-off-by: Nick Peluso <10912027+nap32@users.noreply.github.com> --- cmd/tetragon/flags.go | 18 +++---- docs/content/en/docs/reference/helm-chart.md | 1 + install/kubernetes/README.md | 1 + .../templates/tetragon_configmap.yaml | 3 ++ install/kubernetes/values.yaml | 3 ++ pkg/metrics/config/initmetrics.go | 10 ++++ pkg/metrics/consts/consts.go | 1 + pkg/metrics/eventmetrics/eventmetrics.go | 18 +++---- pkg/metrics/eventmetrics/eventmetrics_test.go | 4 +- pkg/metrics/metrics.go | 48 +++++++++++++++++++ pkg/metrics/metrics_test.go | 29 +++++++++++ pkg/metrics/syscallmetrics/syscallmetrics.go | 10 ++-- pkg/option/config.go | 19 ++++++-- 13 files changed, 138 insertions(+), 27 deletions(-) diff --git a/cmd/tetragon/flags.go b/cmd/tetragon/flags.go index 986de7aada9..e6515342a57 100644 --- a/cmd/tetragon/flags.go +++ b/cmd/tetragon/flags.go @@ -5,6 +5,7 @@ package main import ( "github.com/cilium/tetragon/pkg/logger" + "github.com/cilium/tetragon/pkg/metrics/config" "github.com/cilium/tetragon/pkg/option" "github.com/spf13/viper" @@ -31,14 +32,14 @@ const ( keyEnableCiliumAPI = "enable-cilium-api" keyEnableProcessAncestors = "enable-process-ancestors" - keyMetricsServer = "metrics-server" - keyServerAddress = "server-address" - keyGopsAddr = "gops-address" - keyEnableProcessCred = "enable-process-cred" - keyEnableProcessNs = "enable-process-ns" - keyConfigFile = "config-file" - keyTracingPolicy = "tracing-policy" - 
keyTracingPolicyDir = "tracing-policy-dir" + keyMetricsServer = "metrics-server" + keyMetricsLabelFilter = "metrics-label-filter" + keyServerAddress = "server-address" + keyGopsAddr = "gops-address" + keyEnableProcessCred = "enable-process-cred" + keyEnableProcessNs = "enable-process-ns" + keyTracingPolicy = "tracing-policy" + keyTracingPolicyDir = "tracing-policy-dir" keyCpuProfile = "cpuprofile" keyMemProfile = "memprofile" @@ -114,6 +115,7 @@ func readAndSetFlags() { option.Config.DataCacheSize = viper.GetInt(keyDataCacheSize) option.Config.MetricsServer = viper.GetString(keyMetricsServer) + option.Config.MetricsLabelFilter = config.ParseMetricsLabelFilter(viper.GetString(keyMetricsLabelFilter)) option.Config.ServerAddress = viper.GetString(keyServerAddress) option.Config.ExportFilename = viper.GetString(keyExportFilename) diff --git a/docs/content/en/docs/reference/helm-chart.md b/docs/content/en/docs/reference/helm-chart.md index 51a6b1935b7..0ba35d9cb6b 100644 --- a/docs/content/en/docs/reference/helm-chart.md +++ b/docs/content/en/docs/reference/helm-chart.md @@ -93,6 +93,7 @@ To use [the values available](#values), with `helm install` or `helm upgrade`, u | tetragon.processCacheSize | int | `65536` | | | tetragon.prometheus.address | string | `""` | The address at which to expose metrics. Set it to "" to expose on all available interfaces. | | tetragon.prometheus.enabled | bool | `true` | Whether to enable exposing Tetragon metrics. | +| tetragon.prometheus.metricsLabelFilter | string | `"namespace,workload,pod,binary"` | The labels to include with supporting metrics. The possible values are "namespace", "workload", "pod" and "binary". | | tetragon.prometheus.port | int | `2112` | The port at which to expose metrics. | | tetragon.prometheus.serviceMonitor.enabled | bool | `false` | Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods. 
| | tetragon.prometheus.serviceMonitor.labelsOverride | object | `{}` | The set of labels to place on the 'ServiceMonitor' resource. | diff --git a/install/kubernetes/README.md b/install/kubernetes/README.md index 097ef4490e7..51ee18415dd 100644 --- a/install/kubernetes/README.md +++ b/install/kubernetes/README.md @@ -76,6 +76,7 @@ Helm chart for Tetragon | tetragon.processCacheSize | int | `65536` | | | tetragon.prometheus.address | string | `""` | The address at which to expose metrics. Set it to "" to expose on all available interfaces. | | tetragon.prometheus.enabled | bool | `true` | Whether to enable exposing Tetragon metrics. | +| tetragon.prometheus.metricsLabelFilter | string | `"namespace,workload,pod,binary"` | The labels to include with supporting metrics. The possible values are "namespace", "workload", "pod" and "binary". | | tetragon.prometheus.port | int | `2112` | The port at which to expose metrics. | | tetragon.prometheus.serviceMonitor.enabled | bool | `false` | Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods. | | tetragon.prometheus.serviceMonitor.labelsOverride | object | `{}` | The set of labels to place on the 'ServiceMonitor' resource. 
| diff --git a/install/kubernetes/templates/tetragon_configmap.yaml b/install/kubernetes/templates/tetragon_configmap.yaml index 242e9c9d959..4f5a24eb445 100644 --- a/install/kubernetes/templates/tetragon_configmap.yaml +++ b/install/kubernetes/templates/tetragon_configmap.yaml @@ -37,6 +37,9 @@ data: {{- else }} metrics-server: "" {{- end }} +{{- if .Values.tetragon.prometheus.enabled }} + metrics-label-filter: {{ .Values.tetragon.prometheus.metricsLabelFilter }} +{{- end }} {{- if .Values.tetragon.grpc.enabled }} server-address: {{ .Values.tetragon.grpc.address }} {{- else }} diff --git a/install/kubernetes/values.yaml b/install/kubernetes/values.yaml index 717d10453f0..6ef1eef2e2e 100644 --- a/install/kubernetes/values.yaml +++ b/install/kubernetes/values.yaml @@ -126,6 +126,9 @@ tetragon: address: "" # -- The port at which to expose metrics. port: 2112 + # -- The labels to include with supporting metrics. + # The possible values are "namespace", "workload", "pod" and "binary". + metricsLabelFilter: "namespace,workload,pod,binary" serviceMonitor: # -- Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods. 
enabled: false diff --git a/pkg/metrics/config/initmetrics.go b/pkg/metrics/config/initmetrics.go index cd0da5b637b..4bee6e42e58 100644 --- a/pkg/metrics/config/initmetrics.go +++ b/pkg/metrics/config/initmetrics.go @@ -22,6 +22,8 @@ import ( grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" + + "strings" ) func InitAllMetrics(registry *prometheus.Registry) { @@ -45,3 +47,11 @@ func InitAllMetrics(registry *prometheus.Registry) { registry.MustRegister(grpcmetrics.NewServerMetrics()) version.InitMetrics(registry) } + +func ParseMetricsLabelFilter(labels string) map[string]interface{} { + result := make(map[string]interface{}) + for _, label := range strings.Split(labels, ",") { + result[label] = nil + } + return result +} diff --git a/pkg/metrics/consts/consts.go b/pkg/metrics/consts/consts.go index f7534053268..a55642f537f 100644 --- a/pkg/metrics/consts/consts.go +++ b/pkg/metrics/consts/consts.go @@ -4,3 +4,4 @@ package consts var MetricsNamespace = "tetragon" +var KnownMetricLabelFilters = []string{"namespace", "workload", "pod", "binary"} diff --git a/pkg/metrics/eventmetrics/eventmetrics.go b/pkg/metrics/eventmetrics/eventmetrics.go index 3a4afb826e6..ccca7e0b44c 100644 --- a/pkg/metrics/eventmetrics/eventmetrics.go +++ b/pkg/metrics/eventmetrics/eventmetrics.go @@ -20,12 +20,12 @@ import ( ) var ( - EventsProcessed = metrics.NewCounterVecWithPod(prometheus.CounterOpts{ + EventsProcessed = metrics.MustNewGranularCounter(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, Name: "events_total", Help: "The total number of Tetragon events", ConstLabels: nil, - }, []string{"type", "namespace", "workload", "pod", "binary"}) + }, []string{"type"}) FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, Name: "flags_total", @@ -39,19 +39,19 @@ var ( ConstLabels: nil, }) - policyStats 
= metrics.NewCounterVecWithPod(prometheus.CounterOpts{ + policyStats = metrics.MustNewGranularCounter(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, Name: "policy_events_total", Help: "Policy events calls observed.", ConstLabels: nil, - }, []string{"policy", "hook", "namespace", "workload", "pod", "binary"}) + }, []string{"policy", "hook"}) ) func InitMetrics(registry *prometheus.Registry) { - registry.MustRegister(EventsProcessed) + registry.MustRegister(EventsProcessed.ToProm()) registry.MustRegister(FlagCount) registry.MustRegister(NotifyOverflowedEvents) - registry.MustRegister(policyStats) + registry.MustRegister(policyStats.ToProm()) } func GetProcessInfo(process *tetragon.Process) (binary, pod, workload, namespace string) { @@ -93,10 +93,10 @@ func handleProcessedEvent(pInfo *tracingpolicy.PolicyInfo, processedEvent interf default: eventType = "unknown" } - EventsProcessed.WithLabelValues(eventType, namespace, workload, pod, binary).Inc() + EventsProcessed.ToProm().WithLabelValues(metrics.FilterMetricLabels(eventType, namespace, workload, pod, binary)...).Inc() if pInfo != nil && pInfo.Name != "" { - policyStats. - WithLabelValues(pInfo.Name, pInfo.Hook, namespace, workload, pod, binary). + policyStats.ToProm(). + WithLabelValues(metrics.FilterMetricLabels(pInfo.Name, pInfo.Hook, namespace, workload, pod, binary)...). 
Inc() } } diff --git a/pkg/metrics/eventmetrics/eventmetrics_test.go b/pkg/metrics/eventmetrics/eventmetrics_test.go index 58b3a29c627..336f231e535 100644 --- a/pkg/metrics/eventmetrics/eventmetrics_test.go +++ b/pkg/metrics/eventmetrics/eventmetrics_test.go @@ -15,7 +15,7 @@ import ( ) func TestHandleProcessedEvent(t *testing.T) { - assert.NoError(t, testutil.CollectAndCompare(EventsProcessed, strings.NewReader(""))) + assert.NoError(t, testutil.CollectAndCompare(EventsProcessed.ToProm(), strings.NewReader(""))) handleProcessedEvent(nil, nil) // empty process handleProcessedEvent(nil, &tetragon.GetEventsResponse{Event: &tetragon.GetEventsResponse_ProcessKprobe{ProcessKprobe: &tetragon.ProcessKprobe{}}}) @@ -79,7 +79,7 @@ tetragon_events_total{binary="binary_c",namespace="namespace_c",pod="pod_c",type tetragon_events_total{binary="binary_e",namespace="",pod="",type="PROCESS_EXIT",workload=""} 1 tetragon_events_total{binary="binary_e",namespace="namespace_e",pod="pod_e",type="PROCESS_EXIT",workload="workload_e"} 1 `) - assert.NoError(t, testutil.CollectAndCompare(EventsProcessed, expected)) + assert.NoError(t, testutil.CollectAndCompare(EventsProcessed.ToProm(), expected)) } func TestHandleOriginalEvent(t *testing.T) { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 7c232de0b41..23fa1f269ac 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -4,11 +4,15 @@ package metrics import ( + "fmt" + "golang.org/x/exp/slices" "net/http" "sync" "time" "github.com/cilium/tetragon/pkg/logger" + "github.com/cilium/tetragon/pkg/metrics/consts" + "github.com/cilium/tetragon/pkg/option" "github.com/cilium/tetragon/pkg/podhooks" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -27,6 +31,33 @@ var ( deleteDelay = 1 * time.Minute ) +type GranularCounter struct { + counter *prometheus.CounterVec + CounterOpts prometheus.CounterOpts + labels []string + register sync.Once +} + +func 
MustNewGranularCounter(opts prometheus.CounterOpts, labels []string) *GranularCounter { + for _, label := range labels { + if slices.Contains(consts.KnownMetricLabelFilters, label) { + panic(fmt.Sprintf("labels passed to GranularCounter can't contain any of the following: %v. These labels are added by Tetragon.", consts.KnownMetricLabelFilters)) + } + } + return &GranularCounter{ + CounterOpts: opts, + labels: append(labels, consts.KnownMetricLabelFilters...), + } +} + +func (m *GranularCounter) ToProm() *prometheus.CounterVec { + m.register.Do(func() { + m.labels = FilterMetricLabels(m.labels...) + m.counter = NewCounterVecWithPod(m.CounterOpts, m.labels) + }) + return m.counter +} + // NewCounterVecWithPod is a wrapper around prometheus.NewCounterVec that also registers the metric // to be cleaned up when a pod is deleted. It should be used only to register metrics that have // "pod" and "namespace" labels. @@ -142,3 +173,20 @@ func EnableMetrics(address string) { http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg})) http.ListenAndServe(address, nil) } + +// The FilterMetricLabels func takes in string arguments and returns a slice of those strings omitting the labels it is not configured for. +// IMPORTANT! The filtered metric labels must be passed last and in the exact order of consts.KnownMetricLabelFilters. 
+func FilterMetricLabels(labels ...string) []string { + offset := len(labels) - len(consts.KnownMetricLabelFilters) + if offset < 0 { + logger.GetLogger().WithField("labels", labels).Debug("Not enough labels provided to metrics.FilterMetricLabels.") + return labels + } + result := labels[:offset] + for i, label := range consts.KnownMetricLabelFilters { + if _, ok := option.Config.MetricsLabelFilter[label]; ok { + result = append(result, labels[offset+i]) + } + } + return result +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 82f613471ed..6ee1a27c9a4 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -18,12 +18,41 @@ import ( "github.com/cilium/tetragon/pkg/metrics" "github.com/cilium/tetragon/pkg/metrics/config" "github.com/cilium/tetragon/pkg/metrics/eventmetrics" + "github.com/cilium/tetragon/pkg/option" ) var sampleMsgGenericTracepointUnix = tracing.MsgGenericTracepointUnix{ PolicyName: "fake-policy", } +func TestFilterMetricLabels(t *testing.T) { + option.Config.MetricsLabelFilter = map[string]interface{}{ + "namespace": nil, + "workload": nil, + "pod": nil, + "binary": nil, + } + assert.Equal(t, []string{"type", "namespace", "workspace", "pod", "binary"}, metrics.FilterMetricLabels("type", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"syscall", "namespace", "workspace", "pod", "binary"}, metrics.FilterMetricLabels("syscall", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"namespace", "workspace", "pod", "binary"}, metrics.FilterMetricLabels("namespace", "workspace", "pod", "binary")) + + option.Config.MetricsLabelFilter = map[string]interface{}{ + "namespace": nil, + "workload": nil, + } + assert.Equal(t, []string{"type", "namespace", "workspace"}, metrics.FilterMetricLabels("type", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"syscall", "namespace", "workspace"}, metrics.FilterMetricLabels("syscall", "namespace", "workspace", 
"pod", "binary")) + assert.Equal(t, []string{"namespace", "workspace"}, metrics.FilterMetricLabels("namespace", "workspace", "pod", "binary")) + + option.Config.MetricsLabelFilter = map[string]interface{}{ + "namespace": nil, + "workload": nil, + "pod": nil, + "binary": nil, + } + assert.Equal(t, []string{"type", "syscall"}, metrics.FilterMetricLabels("type", "syscall")) +} + func TestPodDelete(t *testing.T) { reg := metrics.GetRegistry() config.InitAllMetrics(reg) diff --git a/pkg/metrics/syscallmetrics/syscallmetrics.go b/pkg/metrics/syscallmetrics/syscallmetrics.go index fff33972854..3e796963af7 100644 --- a/pkg/metrics/syscallmetrics/syscallmetrics.go +++ b/pkg/metrics/syscallmetrics/syscallmetrics.go @@ -12,16 +12,16 @@ import ( ) var ( - syscallStats = metrics.NewCounterVecWithPod(prometheus.CounterOpts{ + syscallStats = metrics.MustNewGranularCounter(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, Name: "syscalls_total", Help: "System calls observed.", ConstLabels: nil, - }, []string{"syscall", "namespace", "workload", "pod", "binary"}) + }, []string{"syscall"}) ) func InitMetrics(registry *prometheus.Registry) { - registry.MustRegister(syscallStats) + registry.MustRegister(syscallStats.ToProm()) } func Handle(event interface{}) { @@ -46,7 +46,9 @@ func Handle(event interface{}) { } if syscall != "" { - syscallStats.WithLabelValues(syscall, namespace, workload, pod, binary).Inc() + syscallStats.ToProm(). + WithLabelValues(metrics.FilterMetricLabels(syscall, namespace, workload, pod, binary)...). 
+ Inc() } } diff --git a/pkg/option/config.go b/pkg/option/config.go index 20789896753..06dadb6723c 100644 --- a/pkg/option/config.go +++ b/pkg/option/config.go @@ -11,6 +11,7 @@ import ( "time" "github.com/cilium/tetragon/pkg/logger" + "github.com/cilium/tetragon/pkg/metrics/consts" "github.com/spf13/viper" ) @@ -46,10 +47,11 @@ type config struct { ProcessCacheSize int DataCacheSize int - MetricsServer string - ServerAddress string - TracingPolicy string - TracingPolicyDir string + MetricsServer string + MetricsLabelFilter map[string]interface{} + ServerAddress string + TracingPolicy string + TracingPolicyDir string ExportFilename string ExportFileMaxSizeMB int @@ -93,6 +95,15 @@ var ( // LogOpts contains logger parameters LogOpts: make(map[string]string), + + // Default to logging metrics with the greatest granularity. + MetricsLabelFilter: func() map[string]interface{} { + result := make(map[string]interface{}) + for _, label := range consts.KnownMetricLabelFilters { + result[label] = nil + } + return result + }(), } )