Skip to content

Commit

Permalink
backports/v1.0: Add a metric to provide per-event missed events
Browse files Browse the repository at this point in the history
[upstream commit d5a7ee2]

Example:
$ curl localhost:2112/metrics 2> /dev/null | grep 'sent_events_total\|missed_events_total\|ringbuf_perf_event_lost_total\|ringbuf_queue_lost_total\|msg_op_total\|ringbuf_queue_received_total'
tetragon_missed_events_total{msg_op="13"} 73300
tetragon_missed_events_total{msg_op="23"} 28
tetragon_missed_events_total{msg_op="24"} 606
tetragon_missed_events_total{msg_op="5"} 20
tetragon_missed_events_total{msg_op="7"} 22
tetragon_msg_op_total{msg_op="13"} 4.268532e+06
tetragon_msg_op_total{msg_op="23"} 12444
tetragon_msg_op_total{msg_op="24"} 2110
tetragon_msg_op_total{msg_op="5"} 11908
tetragon_msg_op_total{msg_op="7"} 12447
tetragon_ringbuf_perf_event_lost_total 73976
tetragon_ringbuf_queue_lost_total 0
tetragon_ringbuf_queue_received_total 4.307441e+06

This PR adds an eBPF map collector that reads metrics directly from a
BPF map. The map records, per message type, how many perf_event_output
calls failed (i.e. returned a negative value). This lets us determine
the number of missed events per type, which is exposed as the
tetragon_missed_events_total metric.

In the example above, user space reports 73976 lost events
(tetragon_ringbuf_perf_event_lost_total). This equals the sum of all
tetragon_missed_events_total values gathered from the kernel
(73300 + 28 + 606 + 20 + 22 = 73976).

Signed-off-by: Anastasios Papagiannis <tasos.papagiannnis@gmail.com>
  • Loading branch information
tpapagian authored and kkourt committed Nov 3, 2023
1 parent 90c0769 commit 9eacb40
Show file tree
Hide file tree
Showing 20 changed files with 110 additions and 18 deletions.
1 change: 1 addition & 0 deletions bpf/alignchecker/bpf_alignchecker.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ struct execve_map_value _execve_map_value;
struct event_config _event_config;
struct tetragon_conf _tetragon_conf;
struct cgroup_tracking_value _cgroup_tracking_value;
struct kernel_stats _kernel_stats;
2 changes: 1 addition & 1 deletion bpf/cgroup/bpf_cgroup_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ send_cgrp_event(struct bpf_raw_tracepoint_args *ctx,
memcpy(&msg->cgrp_data.name, &cgrp_track->name, KN_NAME_LENGTH);
probe_read_str(&msg->path, PATH_MAP_SIZE - 1, path);

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size);
perf_event_output_metric(ctx, MSG_OP_CGROUP, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size);

return 0;
}
Expand Down
2 changes: 2 additions & 0 deletions bpf/lib/msg_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ enum msg_ops {

MSG_OP_CGROUP = 25,

MSG_OP_LOADER = 26,

MSG_OP_MAX,
};

Expand Down
26 changes: 26 additions & 0 deletions bpf/lib/process.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,30 @@ execve_joined_info_map_get(__u64 tid)

_Static_assert(sizeof(struct execve_map_value) % 8 == 0,
"struct execve_map_value should have size multiple of 8 bytes");

/*
 * Kernel-side statistics stored in tg_stats_map.
 *
 * sent_failed[op] counts perf_event_output() calls that returned a negative
 * value for message op `op` (see perf_event_output_metric()). The index is a
 * u8 there, so 256 slots cover every possible index value.
 *
 * The layout is compared against the Go processapi.KernelStats struct by the
 * alignchecker; keep both definitions in sync.
 */
struct kernel_stats {
__u64 sent_failed[256];
};

/*
 * Per-CPU array with a single entry (key 0) whose value is a
 * struct kernel_stats. User space reads the per-CPU values and sums them
 * to produce the exported counters.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, __u32);
__type(value, struct kernel_stats);
__uint(max_entries, 1);
} tg_stats_map SEC(".maps");

/*
 * perf_event_output_metric - send an event and account for it if sending fails.
 *
 * @ctx:    program context, passed through to perf_event_output()
 * @metric: message op used as the index into kernel_stats.sent_failed; being
 *          a u8 it can never exceed the 256-entry array
 * @map:    perf event array map, passed through to perf_event_output()
 * @flags:  flags passed through to perf_event_output()
 * @data:   event payload
 * @size:   payload size in bytes
 *
 * When perf_event_output() returns a negative value, the sent_failed counter
 * for @metric in tg_stats_map is incremented. The result of
 * perf_event_output() is not reported back to the caller.
 */
static inline __attribute__((always_inline)) void perf_event_output_metric(void *ctx, u8 metric, void *map, u64 flags, void *data, u64 size)
{
	struct kernel_stats *stats;
	__u32 key = 0;
	long rc;

	rc = perf_event_output(ctx, map, flags, data, size);
	if (rc >= 0)
		return;

	/* The event was not sent: bump the per-op failure counter. */
	stats = map_lookup_elem(&tg_stats_map, &key);
	if (!stats)
		return;

	__sync_fetch_and_add(&stats->sent_failed[metric], 1);
}

#endif //_PROCESS__
2 changes: 1 addition & 1 deletion bpf/process/bpf_execve_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,6 @@ execve_send(struct sched_execve_args *ctx)
sizeof(struct msg_capabilities) +
sizeof(struct msg_cred_minimal) + sizeof(struct msg_ns) +
sizeof(struct msg_execve_key) + p->size);
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, event, size);
perf_event_output_metric(ctx, MSG_OP_EXECVE, &tcpmon_map, BPF_F_CURRENT_CPU, event, size);
return 0;
}
3 changes: 1 addition & 2 deletions bpf/process/bpf_exit.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ static inline __attribute__((always_inline)) void event_exit_send(void *ctx, __u
probe_read(&exit->info.code, sizeof(exit->info.code),
_(&task->exit_code));

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, exit,
size);
perf_event_output_metric(ctx, MSG_OP_EXIT, &tcpmon_map, BPF_F_CURRENT_CPU, exit, size);
}
execve_map_delete(tgid);
}
Expand Down
3 changes: 1 addition & 2 deletions bpf/process/bpf_fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ BPF_KPROBE(event_wake_up_new_task, struct task_struct *task)
/* Last: set any encountered error when setting cgroup info */
msg.flags |= error_flags;

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, &msg,
size);
perf_event_output_metric(ctx, MSG_OP_CLONE, &tcpmon_map, BPF_F_CURRENT_CPU, &msg, size);
}
return 0;
}
2 changes: 1 addition & 1 deletion bpf/process/bpf_generic_kprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ generic_kprobe_actions(void *ctx)
__attribute__((section("kprobe/12"), used)) int
generic_kprobe_output(void *ctx)
{
return generic_output(ctx, (struct bpf_map_def *)&process_call_heap);
return generic_output(ctx, (struct bpf_map_def *)&process_call_heap, MSG_OP_GENERIC_KPROBE);
}

__attribute__((section(OVERRIDE), used)) int
Expand Down
2 changes: 1 addition & 1 deletion bpf/process/bpf_generic_retkprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,6 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret)
: [total] "+r"(total)
:);
e->common.size = total;
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
perf_event_output_metric(ctx, MSG_OP_GENERIC_KPROBE, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
return 0;
}
2 changes: 1 addition & 1 deletion bpf/process/bpf_generic_tracepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ generic_tracepoint_actions(void *ctx)
__attribute__((section("tracepoint/12"), used)) int
generic_tracepoint_output(void *ctx)
{
return generic_output(ctx, (struct bpf_map_def *)&tp_heap);
return generic_output(ctx, (struct bpf_map_def *)&tp_heap, MSG_OP_GENERIC_TRACEPOINT);
}

char _license[] __attribute__((section("license"), used)) = "Dual BSD/GPL";
2 changes: 1 addition & 1 deletion bpf/process/bpf_generic_uprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,5 +222,5 @@ generic_uprobe_actions(void *ctx)
__attribute__((section("uprobe/12"), used)) int
generic_uprobe_output(void *ctx)
{
return generic_output(ctx, (struct bpf_map_def *)&process_call_heap);
return generic_output(ctx, (struct bpf_map_def *)&process_call_heap, MSG_OP_GENERIC_UPROBE);
}
5 changes: 2 additions & 3 deletions bpf/process/bpf_loader.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ struct {
__type(value, struct __perf_event_attr);
} attr_heap SEC(".maps");

#define VM_EXEC 0x00000004
#define MSG_OP_LOADER 26
#define VM_EXEC 0x00000004

#define ATTR_BIT_MMAP BIT_ULL(8)
#define ATTR_BIT_MMAP2 BIT_ULL(23)
Expand Down Expand Up @@ -134,6 +133,6 @@ loader_kprobe(struct pt_regs *ctx)
msg->common.op = MSG_OP_LOADER;
msg->common.flags = 0;

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, total);
perf_event_output_metric(ctx, MSG_OP_LOADER, &tcpmon_map, BPF_F_CURRENT_CPU, msg, total);
return 0;
}
5 changes: 2 additions & 3 deletions bpf/process/data_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ __do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t bytes)
return err;

msg->common.size = offsetof(struct msg_data, arg) + bytes;
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg,
msg->common.size);
perf_event_output_metric(ctx, MSG_OP_DATA, &tcpmon_map, BPF_F_CURRENT_CPU, msg, msg->common.size);
return bytes;
b:
return -1;
Expand Down Expand Up @@ -106,7 +105,7 @@ __do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done)
: [size] "+r"(size)
:);
msg->common.size = size;
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size);
perf_event_output_metric(ctx, MSG_OP_DATA, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size);
return ret;
}

Expand Down
4 changes: 2 additions & 2 deletions bpf/process/types/basic.h
Original file line number Diff line number Diff line change
Expand Up @@ -2183,7 +2183,7 @@ generic_actions(void *ctx, struct bpf_map_def *heap,
}

static inline __attribute__((always_inline)) long
generic_output(void *ctx, struct bpf_map_def *heap)
generic_output(void *ctx, struct bpf_map_def *heap, u8 op)
{
struct msg_generic_kprobe *e;
int zero = 0;
Expand Down Expand Up @@ -2226,7 +2226,7 @@ generic_output(void *ctx, struct bpf_map_def *heap)
:
: [total] "+r"(total)
:);
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
perf_event_output_metric(ctx, op, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
return 1;
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/alignchecker/alignchecker.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ func CheckStructAlignments(pathToObj string) error {

// cgroup
"cgroup_tracking_value": {cgrouptrackmap.CgrpTrackingValue{}},

// metrics
"kernel_stats": {processapi.KernelStats{}},
}

return alignchecker.CheckStructAlignments(pathToObj, alignments, true)
Expand Down
4 changes: 4 additions & 0 deletions pkg/api/processapi/processapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,7 @@ type MsgCgroupEvent struct {
CgrpData MsgCgroupData `align:"cgrp_data"` // Complementary cgroup data
Path [CGROUP_PATH_LENGTH]byte `align:"path"` // Full path of the cgroup on fs
}

// KernelStats mirrors the BPF-side struct kernel_stats (the alignchecker
// verifies the two layouts agree). SentFailed holds, per message op, the
// number of events the kernel failed to send.
type KernelStats struct {
SentFailed [256]uint64 `align:"sent_failed"`
}
52 changes: 52 additions & 0 deletions pkg/metrics/eventmetrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package eventmetrics

import (
"path/filepath"
"strconv"

"github.com/cilium/ebpf"
"github.com/cilium/tetragon/pkg/api/processapi"
"github.com/cilium/tetragon/pkg/option"
"github.com/prometheus/client_golang/prometheus"
)

// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps.
// It has no fields: Collect opens the pinned tg_stats_map on every call.
type bpfCollector struct{}

// NewBPFCollector returns a collector that exports the missed-events
// counters read from the tg_stats_map BPF map.
func NewBPFCollector() prometheus.Collector {
	collector := new(bpfCollector)
	return collector
}

// Describe sends the descriptor of the single metric this collector
// produces (MissedEvents) on ch.
func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) {
	desc := MissedEvents.Desc()
	ch <- desc
}

// Collect opens the pinned tg_stats_map, reads the per-CPU KernelStats stored
// under key 0, sums them across CPUs and emits one MissedEvents sample per
// message op whose total is non-zero. The label value is the numeric op.
//
// If the map cannot be opened or read, nothing is emitted.
func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) {
	pinPath := filepath.Join(option.Config.MapDir, "tg_stats_map")
	statsMap, err := ebpf.LoadPinnedMap(pinPath, nil)
	if err != nil {
		return
	}
	defer statsMap.Close()

	// The map is a per-CPU array, so the lookup yields one value per CPU.
	var key uint32
	var perCPU []processapi.KernelStats
	if err := statsMap.Lookup(key, &perCPU); err != nil {
		return
	}

	var totals processapi.KernelStats
	for cpu := range perCPU {
		for op := range totals.SentFailed {
			totals.SentFailed[op] += perCPU[cpu].SentFailed[op]
		}
	}

	for op, missed := range totals.SentFailed {
		if missed == 0 {
			continue
		}
		ch <- MissedEvents.MustMetric(float64(missed), strconv.Itoa(op))
	}
}
5 changes: 5 additions & 0 deletions pkg/metrics/eventmetrics/eventmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ var (
Help: "The total number of Tetragon events",
ConstLabels: nil,
}, []string{"type"})
MissedEvents = metrics.NewBPFCounter(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "missed_events_total"),
"The total number of Tetragon events per type that are failed to sent from the kernel.",
[]string{"msg_op"}, nil,
))
FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "flags_total",
Expand Down
1 change: 1 addition & 0 deletions pkg/metrics/metricsconfig/initmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func InitAllMetrics(registry *prometheus.Registry) {
observer.NewBPFCollector(),
process.NewBPFCollector(),
))
registry.MustRegister(eventmetrics.NewBPFCollector())

// register common third-party collectors
registry.MustRegister(collectors.NewGoCollector())
Expand Down
2 changes: 2 additions & 0 deletions pkg/sensors/base/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ var (
/* Internal statistics for debugging */
ExecveStats = program.MapBuilder("execve_map_stats", Execve)
ExecveJoinMapStats = program.MapBuilder("tg_execve_joined_info_map_stats", ExecveBprmCommit)
StatsMap = program.MapBuilder("tg_stats_map", Execve)

sensor = sensors.Sensor{
Name: "__base__",
Expand Down Expand Up @@ -99,6 +100,7 @@ func GetDefaultMaps() []*program.Map {
NamesMap,
TCPMonMap,
TetragonConfMap,
StatsMap,
}
return maps

Expand Down

0 comments on commit 9eacb40

Please sign in to comment.