From 52277cc21c83c5fe2bf13f14e1f72866a9ce0715 Mon Sep 17 00:00:00 2001
From: Djalal Harouni
Date: Fri, 18 Aug 2023 18:02:28 +0200
Subject: [PATCH] process: comment how we process thread IDs in bpf and userspace

Our current rules for how we collect thread IDs are:

During bpf we send both TID and PID, where:
- fork: TID == PID
- exec: TID == PID
- [ku]probe or tracepoint: TID can differ, it is the ID of the calling thread.
- exit: TID == PID => this is to match the exec event.

At user space we have one entry that is the thread leader, collected
either during clone or exec:
- fork: TID == PID (asserts TID == PID received from the BPF side)
- exec: TID == PID ditto
- exit: TID == PID => this is to match the exec event.
- [ku]probe or tracepoint: we make a copy of the process that is the
  thread leader in the process cache, then update its TID with the one
  recorded from the BPF side. The copy is needed so we don't corrupt
  gRPC handling.

This is far from complete; future fixes should include:
- Record the capabilities and namespaces per thread on the BPF side for
  [ku]probes and tracepoints, and make sure not to overwrite the fields
  of the thread leader that are in the execve_map on the BPF side or in
  the user space process cache with fields of another thread, as those
  caches contain only one thread: the leader.
- Also ensure that [ku]probe and tracepoint events use the per thread
  capabilities and namespaces fields collected from the BPF side instead
  of the fields of the leader in the process cache, which were collected
  during exec or during match filters... and not at the current time.
- Ensure that we always collect thread leader fields and cache them in
  our shadow state in the execve_map and the user space process cache,
  regardless of --enable-process-creds and similar flags.
- When all of these are fixed and the model is clear, maybe by then we
  can remove the extra recording of TIDs from the BPF side during
  fork/clone, exec and exit, as we should by then have asserted our model.

Note that sending the TID that equals PID on clone, exec and exit from
the BPF side, together with the assertion in user space, helps catch
errors in other Tetragon variants that use the OSS version as a base
with custom sensors. The downside is that we send an extra 4 bytes from
bpf, which is fine.

Signed-off-by: Djalal Harouni
---
 bpf/process/bpf_execve_event.c | 12 ++++++++----
 bpf/process/bpf_exit.h         | 10 +++++++++-
 bpf/process/bpf_fork.c         | 11 ++++++++---
 bpf/process/generic_calls.h    |  5 ++++-
 pkg/eventcache/eventcache.go   | 26 +++++++++++++++++++------
 pkg/grpc/exec/exec.go          |  4 +++-
 pkg/grpc/tracing/tracing.go    | 35 +++++++++++++++++++++++++++++-----
 pkg/process/process.go         | 15 +++++++++++----
 8 files changed, 93 insertions(+), 25 deletions(-)

diff --git a/bpf/process/bpf_execve_event.c b/bpf/process/bpf_execve_event.c
index d1b736767e6..b163b960c4c 100644
--- a/bpf/process/bpf_execve_event.c
+++ b/bpf/process/bpf_execve_event.c
@@ -177,10 +177,14 @@ event_execve(struct sched_execve_args *ctx)
 
 	p = &event->process;
 	p->flags = EVENT_EXECVE;
-	/* Send the TGID and TID, as during an execve all threads other
-	 * than the calling thread are destroyed, but since we hook late
-	 * during the execve then the calling thread at the hook time is
-	 * already the new thread group leader.
+	/**
+	 * Per thread tracking rules TID == PID:
+	 * At exec all threads other than the calling one are destroyed, so
+	 * current becomes the new thread leader since we hook late during
+	 * execve.
+	 *
+	 * TODO: for now we send the TID but later we should remove this
+	 * and set it directly inside user space.
 	 */
 	p->pid = pid >> 32;
 	p->tid = (__u32)pid;
diff --git a/bpf/process/bpf_exit.h b/bpf/process/bpf_exit.h
index de6436cfca6..20ac56764cb 100644
--- a/bpf/process/bpf_exit.h
+++ b/bpf/process/bpf_exit.h
@@ -56,7 +56,15 @@ static inline __attribute__((always_inline)) void event_exit_send(void *ctx, __u
 	exit->current.pad[3] = 0;
 	exit->current.ktime = enter->key.ktime;
 
-	/* We track and report only thread leader so here tgid == tid */
+	/**
+	 * Per thread tracking rules TID == PID:
+	 * We want the exit event to match the exec one, and since during exec
+	 * we report the thread group leader, do the same here as we read the exec
+	 * entry from the execve_map anyway and explicitly set it to the tgid.
+	 *
+	 * TODO: for now we send the TID but later we should remove this
+	 * and set it directly inside user space.
+	 */
 	exit->info.tid = tgid;
 	probe_read(&exit->info.code, sizeof(exit->info.code),
 		   _(&task->exit_code));
diff --git a/bpf/process/bpf_fork.c b/bpf/process/bpf_fork.c
index e27138856b1..75ef663bd6b 100644
--- a/bpf/process/bpf_fork.c
+++ b/bpf/process/bpf_fork.c
@@ -75,9 +75,14 @@ BPF_KPROBE(event_wake_up_new_task, struct task_struct *task)
 		.common.ktime = curr->key.ktime,
 		.parent = curr->pkey,
 		.tgid = curr->key.pid,
-		/* Since we generate one event per thread group, then when
-		 * the task wakes up, there will be only one thread here:
-		 * the thread group leader. Pass its thread id to user-space.
+		/**
+		 * Per thread tracking rules TID == PID:
+		 * Since we generate one event per thread group, when this task
+		 * wakes up it will be the only one in the thread group, and it
+		 * is the leader. Make sure to pass its TID to user space.
+		 *
+		 * TODO: for now we send the TID but later we should remove this
+		 * and set it directly inside user space.
 		 */
 		.tid = BPF_CORE_READ(task, pid),
 		.ktime = curr->key.ktime,
diff --git a/bpf/process/generic_calls.h b/bpf/process/generic_calls.h
index be05ab6f2eb..cd448d9fff5 100644
--- a/bpf/process/generic_calls.h
+++ b/bpf/process/generic_calls.h
@@ -79,7 +79,10 @@ generic_process_init(struct msg_generic_kprobe *e, u8 op, struct event_config *c
 
 	e->action = 0;
 
-	/* Initialize with the calling TID */
+	/**
+	 * Per thread tracking rules TID is the calling thread:
+	 * At kprobes, tracepoints etc. we report the calling thread ID to user space.
+	 */
 	e->tid = (__u32)get_current_pid_tgid();
 }
 
diff --git a/pkg/eventcache/eventcache.go b/pkg/eventcache/eventcache.go
index 502d32570fb..37084635f22 100644
--- a/pkg/eventcache/eventcache.go
+++ b/pkg/eventcache/eventcache.go
@@ -69,10 +69,17 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
 	}
 
 	if internal != nil {
+		// When we report the per thread fields, take a copy
+		// of the thread leader from the cache, then update the corresponding
+		// per thread fields.
+		//
+		// The cost to get this is relatively high because it requires a
+		// deep copy of all the fields of the thread leader from the cache in
+		// order to safely modify them and not corrupt gRPC streams.
+		//
+		// TODO: improve this so it copies only the shared fields and directly
+		// updates the per thread fields with values recorded from BPF.
 		proc := internal.GetProcessCopy()
-		// The TID of the cached process can be different from the
-		// TID that triggered the event, so always use the recorded
-		// one from bpf.
 		process.UpdateEventProcessTid(proc, tid)
 		ev.SetProcess(proc)
 	} else {
@@ -97,10 +104,17 @@ func HandleGenericEvent(internal *process.ProcessInternal, ev notify.Event, tid
 		return ErrFailedToGetPodInfo
 	}
 
+	// When we report the per thread fields, take a copy
+	// of the thread leader from the cache, then update the corresponding
+	// per thread fields.
+	//
+	// The cost to get this is relatively high because it requires a
+	// deep copy of all the fields of the thread leader from the cache in
+	// order to safely modify them and not corrupt gRPC streams.
+	//
+	// TODO: improve this so it copies only the shared fields and directly
+	// updates the per thread fields with values recorded from BPF.
 	proc := internal.GetProcessCopy()
-	// The TID of the cached process can be different from the
-	// TID that triggered the event, so always use the recorded
-	// one from bpf.
 	process.UpdateEventProcessTid(proc, tid)
 	ev.SetProcess(proc)
 	return nil
diff --git a/pkg/grpc/exec/exec.go b/pkg/grpc/exec/exec.go
index 57a27109654..5c1e318a8df 100644
--- a/pkg/grpc/exec/exec.go
+++ b/pkg/grpc/exec/exec.go
@@ -288,7 +288,9 @@ func GetProcessExit(event *MsgExitEventUnix) *tetragon.ProcessExit {
 	code := event.Info.Code >> 8
 	signal := readerexec.Signal(event.Info.Code & 0xFF)
 
-	// Ensure that we get PID == TID
+	// Per thread tracking rules PID == TID: ensure that TID equals PID.
+	//
+	// TODO: remove the TID from the bpf side and the following check.
 	if event.ProcessKey.Pid != event.Info.Tid {
 		logger.GetLogger().WithFields(logrus.Fields{
 			"event.name": "Exit",
diff --git a/pkg/grpc/tracing/tracing.go b/pkg/grpc/tracing/tracing.go
index cf470ca6082..d9f22e0321d 100644
--- a/pkg/grpc/tracing/tracing.go
+++ b/pkg/grpc/tracing/tracing.go
@@ -256,8 +256,17 @@ func GetProcessKprobe(event *MsgGenericKprobeUnix) *tetragon.ProcessKprobe {
 	}
 
 	if proc != nil {
+		// At kprobes we report the per thread fields, so take a copy
+		// of the thread leader from the cache, then update the corresponding
+		// per thread fields.
+		//
+		// The cost to get this is relatively high because it requires a
+		// deep copy of all the fields of the thread leader from the cache in
+		// order to safely modify them and not corrupt gRPC streams.
+		//
+		// TODO: improve this so it copies only the shared fields and directly
+		// updates the per thread fields with values recorded from BPF.
 		tetragonEvent.Process = proc.GetProcessCopy()
-		// Use the bpf recorded TID to update the event
 		process.UpdateEventProcessTid(tetragonEvent.Process, &event.Tid)
 	}
 	if parent != nil {
@@ -356,10 +365,17 @@ func (msg *MsgGenericTracepointUnix) HandleMessage() *tetragon.GetEventsResponse
 	}
 
 	if proc != nil {
-		tetragonEvent.Process = proc.GetProcessCopy()
-		// Use the bpf recorded TID to update the event
+		// At tracepoints we report the per thread fields, so take a copy
+		// of the thread leader from the cache, then update the corresponding
+		// per thread fields.
+		//
 		// The cost to get this is relatively high because it requires a
-		// deep copyo of the process in order to safely modify it.
+		// deep copy of all the fields of the thread leader from the cache in
+		// order to safely modify them and not corrupt gRPC streams.
+		//
+		// TODO: improve this so it copies only the shared fields and directly
+		// updates the per thread fields with values recorded from BPF.
+		tetragonEvent.Process = proc.GetProcessCopy()
 		process.UpdateEventProcessTid(tetragonEvent.Process, &msg.Tid)
 	}
 
@@ -576,8 +592,17 @@ func GetProcessUprobe(event *MsgGenericUprobeUnix) *tetragon.ProcessUprobe {
 	}
 
 	if proc != nil {
+		// At uprobes we report the per thread fields, so take a copy
+		// of the thread leader from the cache, then update the corresponding
+		// per thread fields.
+		//
+		// The cost to get this is relatively high because it requires a
+		// deep copy of all the fields of the thread leader from the cache in
+		// order to safely modify them and not corrupt gRPC streams.
+		//
+		// TODO: improve this so it copies only the shared fields and directly
+		// updates the per thread fields with values recorded from BPF.
 		tetragonEvent.Process = proc.GetProcessCopy()
-		// Use the bpf recorded TID to update the event
 		process.UpdateEventProcessTid(tetragonEvent.Process, &event.Tid)
 	}
 	return tetragonEvent
diff --git a/pkg/process/process.go b/pkg/process/process.go
index 343e1d7a1a1..4eab71b2d79 100644
--- a/pkg/process/process.go
+++ b/pkg/process/process.go
@@ -206,9 +206,13 @@ func initProcessInternalExec(
 	ns := namespace.GetMsgNamespaces(namespaces)
 	binary := path.GetBinaryAbsolutePath(process.Filename, cwd)
 
-	// Ensure that exported events have the TID set. For events from Kernel
+	// Per thread tracking rules PID == TID: ensure that TID equals PID.
+	//
+	// Also ensure that exported events have the TID set. For events from the kernel
 	// we usually use PID == 0, so instead of checking against 0, assert that
-	// TGID == TID
+	// TGID == TID.
+	//
+	// TODO: remove the TID from the bpf side and the following check.
 	if process.PID != process.TID {
 		logger.GetLogger().WithFields(logrus.Fields{
 			"event.name": "Execve",
@@ -261,8 +265,11 @@ func initProcessInternalClone(event *tetragonAPI.MsgCloneEvent,
 	pi.process.ParentExecId = parentExecId
 	pi.process.ExecId = GetProcessID(event.PID, event.Ktime)
 	pi.process.Pid = &wrapperspb.UInt32Value{Value: event.PID}
-	// Since from BPF side we only generate one clone event per
-	// thread group that is for the leader, assert on that.
+	// Per thread tracking rules PID == TID: ensure that TID equals PID.
+	// Since from the BPF side we only generate one clone event per
+	// thread group, which is for the leader, assert on that.
+	//
+	// TODO: remove the TID from the bpf side and the following check.
 	if event.PID != event.TID {
 		logger.GetLogger().WithFields(logrus.Fields{
 			"event.name": "Clone",