-
Notifications
You must be signed in to change notification settings - Fork 384
/
Copy pathbpf_cgroup.h
437 lines (374 loc) · 12.1 KB
/
bpf_cgroup.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright Authors of Tetragon */
#ifndef __BPF_CGROUP_
#define __BPF_CGROUP_
#include "bpf_event.h"
#include "bpf_helpers.h"
#include "environ_conf.h"
#include "common.h"
#include "process.h"
#define NULL ((void *)0)
#ifndef CGROUP_SUPER_MAGIC
#define CGROUP_SUPER_MAGIC 0x27e0eb /* Cgroupv1 pseudo FS */
#endif
#ifndef CGROUP2_SUPER_MAGIC
#define CGROUP2_SUPER_MAGIC 0x63677270 /* Cgroupv2 pseudo FS */
#endif
/* Our kernfs node name length, can be made 256? */
#define KN_NAME_LENGTH 128
/* Max nested cgroups that are tracked. Arbitrary value, nested cgroups
* that are at a level greater than 32 will be attached to the cgroup
* at level 32.
*/
#define CGROUP_MAX_NESTED_LEVEL 32
/* Lifecycle states of a cgroup as tracked in tg_cgrps_tracking_map. */
typedef enum {
	CGROUP_UNTRACKED = 0, /* Cgroup was created but we did not track it */
	CGROUP_NEW = 1, /* Cgroup was just created */
	CGROUP_RUNNING = 2, /* new => running (fork,exec task inside) */
	CGROUP_RUNNING_PROC = 3, /* Generated from pids of procfs */
} cgroup_state;
/* Represent old kernfs node with the kernfs_node_id
 * union to read the id in 5.4 kernels and older.
 *
 * The "___old" suffix is a CO-RE naming convention: libbpf ignores
 * everything after the triple underscore when matching this local
 * definition against the running kernel's BTF, so this acts as an
 * alternate layout of "struct kernfs_node" for old kernels.
 */
struct kernfs_node___old {
	union kernfs_node_id id;
};
/* Per-cgroup tracking data, stored per cgroup id in tg_cgrps_tracking_map
 * and embedded in msg_cgroup_event sent to user space.
 */
struct cgroup_tracking_value {
	/* State of cgroup */
	cgroup_state state;

	/* Unique id for the hierarchy this is mostly for cgroupv1 */
	__u32 hierarchy_id;

	/* The depth this cgroup is at */
	__u32 level;

	/* Explicit padding; keeps layout stable between bpf and user space */
	__u32 pad;

	/* Cgroup kernfs_node name */
	char name[KN_NAME_LENGTH];
}; // All fields aligned so no 'packed' attribute.
/* Cgroup event message emitted to user space, built in tg_cgrps_msg_heap. */
struct msg_cgroup_event {
	struct msg_common common; /* Common event header */
	struct msg_execve_key parent; /* Parent process key */
	__u32 cgrp_op; /* Current cgroup operation */
	__u32 pid;
	__u32 nspid;
	__u32 flags;
	__u64 ktime;
	__u64 cgrpid_tracker; /* Cgroup ID that is used as a tracker for the current cgroup */
	__u64 cgrpid; /* Current cgroup ID */
	struct cgroup_tracking_value cgrp_data; /* Current cgroup data */
	char path[PATH_MAP_SIZE]; /* Current cgroup path */
}; // All fields aligned so no 'packed' attribute.
/* Map to track cgroups per IDs. Shared with user space, which seeds it
 * from procfs and reads tracked cgroup state back out.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 32768);
	__type(key, __u64); /* Key is the cgrpid */
	__type(value, struct cgroup_tracking_value);
} tg_cgrps_tracking_map SEC(".maps");
/* Heap used to construct a cgroup_tracking_value. Per-CPU scratch space
 * to avoid exceeding the BPF 512-byte stack limit with the large value.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __s32);
	__type(value, struct cgroup_tracking_value);
} tg_cgrps_tracking_heap SEC(".maps");
/* Heap used to construct a msg_cgroup_event. Per-CPU scratch space since
 * the event (with its PATH_MAP_SIZE path buffer) cannot fit on the BPF stack.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct msg_cgroup_event);
} tg_cgrps_msg_heap SEC(".maps");
/**
 * __get_cgroup_kn_name() Returns a pointer to the kernfs node name
 * @kn: target kernfs node
 *
 * Returns a pointer to the kernfs node name on success, NULL on failures.
 * Note: the returned pointer is a kernel address; callers must copy it
 * out with probe_read_str() (or similar) before using the bytes.
 */
FUNC_INLINE const char *__get_cgroup_kn_name(const struct kernfs_node *kn)
{
	const char *name = NULL;

	if (kn)
		probe_read_kernel(&name, sizeof(name), _(&kn->name));
	return name;
}
/**
 * __get_cgroup_kn_id() Returns the kernfs node id
 * @kn: target kernfs node
 *
 * Returns the kernfs node id on success, zero on failures.
 *
 * Handles both the modern layout (kn->id is a plain integer) and the
 * pre-5.5 layout where the id lives inside a kernfs_node_id union; the
 * CO-RE field-existence check selects the right read at load time.
 */
FUNC_INLINE __u64 __get_cgroup_kn_id(const struct kernfs_node *kn)
{
	__u64 id = 0;

	if (!kn)
		return id;

	/* Kernels prior to 5.5 have the kernfs_node_id, but distros (RHEL)
	 * seem to have kernfs_node_id defined for UAPI reasons even though
	 * its not used here directly. To resolve this walk struct for id.id
	 */
	if (bpf_core_field_exists(((struct kernfs_node___old *)0)->id.id)) {
		struct kernfs_node___old *old_kn;

		old_kn = (void *)kn;
		if (BPF_CORE_READ_INTO(&id, old_kn, id.id) != 0)
			return 0;
	} else {
		probe_read_kernel(&id, sizeof(id), _(&kn->id));
	}

	return id;
}
/**
 * __get_cgroup_kn() Returns the kernfs_node of the cgroup
 * @cgrp: target cgroup
 *
 * Returns the kernfs_node of the cgroup on success, NULL on failures.
 */
FUNC_INLINE struct kernfs_node *__get_cgroup_kn(const struct cgroup *cgrp)
{
	struct kernfs_node *kn = NULL;

	if (!cgrp)
		return NULL;

	/* kn stays NULL if the kernel read fails */
	probe_read_kernel(&kn, sizeof(cgrp->kn), _(&cgrp->kn));
	return kn;
}
/**
 * get_cgroup_hierarchy_id() Returns the cgroup hierarchy id
 * @cgrp: target cgroup
 *
 * Returns the cgroup hierarchy id. Make sure you pass a valid
 * cgroup, this can not fail.
 *
 * Returning zero means the cgroup is running on the default
 * hierarchy.
 */
FUNC_INLINE __u32 get_cgroup_hierarchy_id(const struct cgroup *cgrp)
{
	/* Initialize so that a failed CO-RE read returns 0 (default
	 * hierarchy) instead of an uninitialized stack value.
	 */
	__u32 id = 0;

	BPF_CORE_READ_INTO(&id, cgrp, root, hierarchy_id);
	return id;
}
/**
 * get_cgroup_name() Returns a pointer to the cgroup name
 * @cgrp: target cgroup
 *
 * Returns a pointer to the cgroup node name on success that can
 * be read with probe_read_kernel(). NULL on failures.
 */
FUNC_INLINE const char *get_cgroup_name(const struct cgroup *cgrp)
{
	const char *name;

	if (unlikely(!cgrp))
		return NULL;

	/* Walks cgrp->kn->name with CO-RE relocations; any failed hop
	 * makes the whole read fail, so name is only valid on rc == 0.
	 */
	if (BPF_CORE_READ_INTO(&name, cgrp, kn, name) != 0)
		return NULL;

	return name;
}
/**
 * get_cgroup_level() Returns the cgroup level
 * @cgrp: target cgroup
 *
 * Returns the cgroup level, or 0 if it can not be retrieved.
 */
FUNC_INLINE __u32 get_cgroup_level(const struct cgroup *cgrp)
{
	/* Pre-set to 0 so a failed probe read yields the documented fallback */
	__u32 level = 0;

	probe_read_kernel(&level, sizeof(level), _(&cgrp->level));
	return level;
}
/**
 * get_cgroup_id() Returns cgroup id
 * @cgrp: target cgroup
 *
 * Returns the cgroup id of the target cgroup on success, zero on failures.
 */
FUNC_INLINE __u64 get_cgroup_id(const struct cgroup *cgrp)
{
	/* The cgroup id is the id of its backing kernfs node; both
	 * helpers below tolerate NULL and propagate 0 on failure.
	 */
	return __get_cgroup_kn_id(__get_cgroup_kn(cgrp));
}
/**
 * get_task_cgroup() Returns the accurate or desired cgroup of the css of
 * current task that we want to operate on.
 * @task: must be current task.
 * @cgrpfs_ver: cgroup file system magic.
 * @subsys_idx: index of the desired cgroup_subsys_state part of css_set.
 * Passing a zero as a subsys_idx is fine assuming you want that.
 * @error_flags: error flags that will be ORed to indicate errors on
 * failures.
 *
 * If on Cgroupv2 returns the default cgroup associated with the task css_set.
 * If on Cgroupv1 returns the cgroup indexed at subsys_idx of the task
 * css_set.
 * On failures NULL is returned and the error_flags is ORed to indicate the
 * corresponding error.
 *
 * To get cgroup and kernfs node information we want to operate on the right
 * cgroup hierarchy which is setup by user space. However due to the
 * incompatibility between cgroup v1 and v2; how user space initialize and
 * install cgroup controllers, etc, it can be difficult.
 *
 * Use this helper and pass the css index that you consider accurate and
 * which can be discovered at runtime in user space.
 * Usually it is the 'memory' or 'pids' indexes by reading /proc/cgroups
 * file in case of Cgroupv1 where each line number is the index starting
 * from zero without counting first comment line.
 */
FUNC_INLINE struct cgroup *
get_task_cgroup(struct task_struct *task, __u64 cgrpfs_ver, __u32 subsys_idx, __u32 *error_flags)
{
	struct cgroup_subsys_state *subsys;
	struct css_set *cgroups;
	struct cgroup *cgrp = NULL;

	/* Read the task's css_set; without it we can't resolve any cgroup */
	probe_read_kernel(&cgroups, sizeof(cgroups), _(&task->cgroups));
	if (unlikely(!cgroups)) {
		*error_flags |= EVENT_ERROR_CGROUPS;
		return cgrp;
	}

	/* If we are in Cgroupv2 return the default css_set cgroup */
	if (cgrpfs_ver == CGROUP2_SUPER_MAGIC) {
		probe_read_kernel(&cgrp, sizeof(cgrp), _(&cgroups->dfl_cgrp));
		if (!cgrp)
			*error_flags |= EVENT_ERROR_CGROUP_SUBSYSCGRP;
		return cgrp;
	}

	/* We are interested only in the cpuset, memory or pids controllers
	 * which are indexed at 0, 4 and 11 respectively assuming all controllers
	 * are compiled in.
	 * When we use the controllers indexes we will first discover these indexes
	 * dynamically in user space which will work on all setups from reading
	 * file: /proc/cgroups. If we fail to discover the indexes then passing
	 * a default index zero should be fine assuming we also want that.
	 *
	 * Reference: https://elixir.bootlin.com/linux/v5.19/source/include/linux/cgroup_subsys.h
	 *
	 * Notes:
	 * Newer controllers should be appended at the end. controllers
	 * that are not upstreamed may mess the calculation here
	 * especially if they happen to be before the desired subsys_idx,
	 * we fail.
	 */
	if (unlikely(subsys_idx > pids_cgrp_id)) {
		*error_flags |= EVENT_ERROR_CGROUP_SUBSYS;
		return cgrp;
	}

	/* Read css from the passed subsys index to ensure that we operate
	 * on the desired controller. This allows user space to be flexible
	 * and chose the right per cgroup subsystem to use in order to
	 * support as much as workload as possible. It also reduces errors
	 * in a significant way.
	 */
	probe_read_kernel(&subsys, sizeof(subsys), _(&cgroups->subsys[subsys_idx]));
	if (unlikely(!subsys)) {
		*error_flags |= EVENT_ERROR_CGROUP_SUBSYS;
		return cgrp;
	}

	/* Finally resolve the cgroup that the subsystem state belongs to */
	probe_read_kernel(&cgrp, sizeof(cgrp), _(&subsys->cgroup));
	if (!cgrp)
		*error_flags |= EVENT_ERROR_CGROUP_SUBSYSCGRP;

	return cgrp;
}
/**
 * __tg_get_current_cgroup_id() Returns the accurate cgroup id of current task.
 * @cgrp: cgroup target of current task.
 * @cgrpfs_ver: Cgroupfs Magic number either Cgroupv1 or Cgroupv2
 *
 * It handles both cgroupv2 and cgroupv1.
 * If @cgrpfs_ver is default cgroupv2 hierarchy, then it uses the bpf
 * helper bpf_get_current_cgroup_id() to retrieve the cgroup id. Otherwise
 * it falls back on using the passed @cgrp
 *
 * Returns the cgroup id of current task on success, zero on failures.
 */
FUNC_INLINE __u64
__tg_get_current_cgroup_id(struct cgroup *cgrp, __u64 cgrpfs_ver)
{
	/* On a unified cgroupv2 hierarchy prefer the dedicated bpf helper
	 * when the running kernel provides it.
	 */
	if (cgrpfs_ver == CGROUP2_SUPER_MAGIC &&
	    bpf_core_enum_value_exists(enum bpf_func_id,
				       BPF_FUNC_get_current_cgroup_id))
		return get_current_cgroup_id();

	/* Cgroupv1 or old kernel: derive the id from the passed cgroup */
	return get_cgroup_id(cgrp);
}
/**
 * tg_get_current_cgroup_id() Returns the accurate cgroup id of current task.
 *
 * It works similar to __tg_get_current_cgroup_id, but computes the cgrp if it is needed.
 * Returns the cgroup id of current task on success, zero on failures.
 */
FUNC_INLINE __u64 tg_get_current_cgroup_id(void)
{
	/* Must be zero-initialized: get_task_cgroup() ORs error bits into it */
	__u32 error_flags = 0;
	struct cgroup *cgrp;
	__u64 cgrpfs_magic = 0;
	struct task_struct *task;
	struct tetragon_conf *conf;
	int zero = 0, subsys_idx = 0;

	conf = map_lookup_elem(&tg_conf_map, &zero);
	if (conf) {
		/* Select which cgroup version */
		cgrpfs_magic = conf->cgrp_fs_magic;
		subsys_idx = conf->tg_cgrpv1_subsys_idx;
	}

	/*
	 * Try the bpf helper on the default hierarchy if available
	 * and if we are running in unified cgroupv2
	 */
	if (bpf_core_enum_value_exists(enum bpf_func_id,
				       BPF_FUNC_get_current_cgroup_id) &&
	    cgrpfs_magic == CGROUP2_SUPER_MAGIC) {
		return get_current_cgroup_id();
	}

	task = (struct task_struct *)get_current_task();

	// NB: error_flags are ignored for now
	cgrp = get_task_cgroup(task, cgrpfs_magic, subsys_idx, &error_flags);
	if (!cgrp)
		return 0;

	return get_cgroup_id(cgrp);
}
/**
 * __get_cgrp_tracking_val_heap() Get a cgroup_tracking_val from the
 * tg_cgrps_tracking_heap map while setting its fields.
 */
FUNC_INLINE struct cgroup_tracking_value *
__get_cgrp_tracking_val_heap(cgroup_state state, __u32 hierarchy_id,
			     __u32 level)
{
	struct cgroup_tracking_value *heap;
	int zero = 0;

	heap = map_lookup_elem(&tg_cgrps_tracking_heap, &zero);
	if (!heap)
		return NULL;

	/* Clear any stale data left on this per-CPU slot, then seed fields */
	memset(heap, 0, sizeof(*heap));
	heap->state = state;
	heap->hierarchy_id = hierarchy_id;
	heap->level = level;

	return heap;
}
/**
 * __init_cgrp_tracking_val_heap() Initialize a cgroup_tracking_val that is
 * obtained with __get_cgrp_tracking_val_heap(). It will initialize and
 * set the cgroup name too.
 *
 * Returns the per-CPU heap entry on success, NULL if the heap lookup
 * failed. If the cgroup name can not be resolved the entry is still
 * returned with an empty (zeroed) name.
 */
FUNC_INLINE struct cgroup_tracking_value *
__init_cgrp_tracking_val_heap(struct cgroup *cgrp, cgroup_state state)
{
	const char *name;
	struct kernfs_node *kn;
	__u32 level, hierarchy_id;
	struct cgroup_tracking_value *heap;

	hierarchy_id = get_cgroup_hierarchy_id(cgrp);
	level = get_cgroup_level(cgrp);
	heap = __get_cgrp_tracking_val_heap(state, hierarchy_id, level);
	if (!heap)
		return heap;

	/* Copy the kernfs node name; bound leaves room for NUL terminator */
	kn = __get_cgroup_kn(cgrp);
	name = __get_cgroup_kn_name(kn);
	if (name)
		probe_read_str(&heap->name, KN_NAME_LENGTH - 1, name);

	return heap;
}
#endif // __BPF_CGROUP_