Skip to content

Commit

Permalink
grpc: nil check process.Pid
Browse files Browse the repository at this point in the history
We have seen segmentation faults (nil pointer dereferences) where process.Pid is sometimes nil
due to errors in the cache. Although another patch in this series addresses the currently known
causes, this is still a class of bug that tends to crop up from time to time. Add a metric to keep
track of these occurrences and perform a nil check so that Tetragon can keep running.

Signed-off-by: William Findlay <will@isovalent.com>
  • Loading branch information
willfindlay committed Nov 2, 2023
1 parent 6b34552 commit d7a7801
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pkg/grpc/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/cilium/tetragon/pkg/ktime"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
"github.com/cilium/tetragon/pkg/option"
"github.com/cilium/tetragon/pkg/process"
readerexec "github.com/cilium/tetragon/pkg/reader/exec"
Expand Down Expand Up @@ -67,6 +68,11 @@ func GetProcessExec(event *MsgExecveEventUnix, useCache bool) *tetragon.ProcessE
Parent: tetragonParent,
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessExec: nil Process.Pid").Inc()
return nil
}

if useCache {
if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonEvent.Process) || (tetragonProcess.Pid.Value > 1 && ec.Needed(tetragonEvent.Parent))) {
Expand Down Expand Up @@ -385,6 +391,12 @@ func GetProcessExit(event *MsgExitEventUnix) *tetragon.ProcessExit {
Status: code,
Time: ktime.ToProto(event.Common.Ktime),
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessExit: nil Process.Pid").Inc()
return nil
}

ec := eventcache.Get()
if ec != nil &&
(ec.Needed(tetragonProcess) ||
Expand Down
21 changes: 21 additions & 0 deletions pkg/grpc/tracing/tracing.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package tracing
import (
"fmt"

"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
"github.com/cilium/tetragon/pkg/reader/kernel"
"golang.org/x/sys/unix"

Expand Down Expand Up @@ -304,6 +305,11 @@ func GetProcessKprobe(event *MsgGenericKprobeUnix) *tetragon.ProcessKprobe {
PolicyName: event.PolicyName,
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessKprobe: nil Process.Pid").Inc()
return nil
}

if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonProcess) ||
(tetragonProcess.Pid.Value > 1 && ec.Needed(tetragonParent))) {
Expand Down Expand Up @@ -413,6 +419,11 @@ func (msg *MsgGenericTracepointUnix) HandleMessage() *tetragon.GetEventsResponse
Action: kprobeAction(msg.Action),
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessTracepoint: nil Process.Pid").Inc()
return nil
}

if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonProcess) ||
(tetragonProcess.Pid.Value > 1 && ec.Needed(tetragonParent))) {
Expand Down Expand Up @@ -532,6 +543,11 @@ func GetProcessLoader(msg *MsgProcessLoaderUnix) *tetragon.ProcessLoader {
tetragonProcess = process.UnsafeGetProcess()
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessLoader: nil Process.Pid").Inc()
return nil
}

if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonProcess) || (tetragonProcess.Pid.Value > 1)) {
tetragonEvent := &ProcessLoaderNotify{}
Expand Down Expand Up @@ -639,6 +655,11 @@ func GetProcessUprobe(event *MsgGenericUprobeUnix) *tetragon.ProcessUprobe {
PolicyName: event.PolicyName,
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessUprobe: nil Process.Pid").Inc()
return nil
}

if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonProcess) ||
(tetragonProcess.Pid.Value > 1 && ec.Needed(tetragonParent))) {
Expand Down
12 changes: 12 additions & 0 deletions pkg/metrics/eventcachemetrics/eventcachemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,19 @@ var (
Help: "The total number of Tetragon event cache accesses. For internal use only.",
ConstLabels: nil,
})
eventCacheErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "event_cache_errors_total",
Help: "The total of errors encountered while fetching process exec information from the cache.",
ConstLabels: nil,
}, []string{"error"})
)

// InitMetrics registers the collectors declared in this package with the
// given registry, one at a time and in a fixed order, via MustRegister.
func InitMetrics(registry *prometheus.Registry) {
	collectors := []prometheus.Collector{
		processInfoErrors,
		podInfoErrors,
		EventCacheCount,
		eventCacheErrorsTotal,
	}
	for _, c := range collectors {
		registry.MustRegister(c)
	}
}

// Get a new handle on a processInfoErrors metric for an eventType
Expand All @@ -44,3 +51,8 @@ func ProcessInfoError(eventType string) prometheus.Counter {
func PodInfoError(eventType string) prometheus.Counter {
	// Look up the podInfoErrors child counter labelled with eventType.
	counter := podInfoErrors.WithLabelValues(eventType)
	return counter
}

// EventCacheError returns a handle on the eventCacheErrorsTotal counter whose
// "error" label is set to err.
func EventCacheError(err string) prometheus.Counter {
	counter := eventCacheErrorsTotal.WithLabelValues(err)
	return counter
}

0 comments on commit d7a7801

Please sign in to comment.