From bc9e857d0f85accbe97d62b94c7c2e8f1427af08 Mon Sep 17 00:00:00 2001 From: Tonis Tiigi Date: Fri, 16 Aug 2024 11:46:09 +0300 Subject: [PATCH] executor: detect containers killed by OOMKiller If a container exits with an error and has invoked the OOMKiller, mark the origin error as ENOMEM so that it can be detected on the client side. gRPC will set ENOMEM as codes.ResourceExhausted based on #5182 Signed-off-by: Tonis Tiigi --- executor/runcexecutor/executor.go | 9 +++-- executor/runcexecutor/executor_linux.go | 46 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/executor/runcexecutor/executor.go b/executor/runcexecutor/executor.go index 2a50ef959fb4a..3606ee719dbb8 100644 --- a/executor/runcexecutor/executor.go +++ b/executor/runcexecutor/executor.go @@ -335,7 +335,7 @@ func (w *runcExecutor) Run(ctx context.Context, id string, root executor.Mount, } doReleaseNetwork = false - err = exitError(ctx, err) + err = exitError(ctx, cgroupPath, err) if err != nil { if rec != nil { rec.Close() @@ -351,7 +351,7 @@ func (w *runcExecutor) Run(ctx context.Context, id string, root executor.Mount, return rec, rec.CloseAsync(releaseContainer) } -func exitError(ctx context.Context, err error) error { +func exitError(ctx context.Context, cgroupPath string, err error) error { if err != nil { exitErr := &gatewayapi.ExitError{ ExitCode: gatewayapi.UnknownExitStatus, @@ -363,6 +363,9 @@ func exitError(ctx context.Context, err error) error { ExitCode: uint32(runcExitError.Status), } } + + detectOOM(ctx, cgroupPath, exitErr) + trace.SpanFromContext(ctx).AddEvent( "Container exited", trace.WithAttributes( @@ -453,7 +456,7 @@ func (w *runcExecutor) Exec(ctx context.Context, id string, process executor.Pro } err = w.exec(ctx, id, spec.Process, process, nil) - return exitError(ctx, err) + return exitError(ctx, "", err) } type forwardIO struct { diff --git a/executor/runcexecutor/executor_linux.go b/executor/runcexecutor/executor_linux.go index
6ae39d6eefcc1..350075cdd21f3 100644 --- a/executor/runcexecutor/executor_linux.go +++ b/executor/runcexecutor/executor_linux.go @@ -1,14 +1,19 @@ package runcexecutor import ( + "bufio" "context" "io" "os" + "path/filepath" + "strconv" + "strings" "syscall" "github.com/containerd/console" runc "github.com/containerd/go-runc" "github.com/moby/buildkit/executor" + gatewayapi "github.com/moby/buildkit/frontend/gateway/pb" "github.com/moby/buildkit/util/bklog" "github.com/moby/sys/signal" "github.com/opencontainers/runtime-spec/specs-go" @@ -172,3 +177,44 @@ func (w *runcExecutor) callWithIO(ctx context.Context, process executor.ProcessI return call(ctx, startedCh, runcIO, killer.pidfile) } + +func detectOOM(ctx context.Context, ns string, gwErr *gatewayapi.ExitError) { + const defaultCgroupMountpoint = "/sys/fs/cgroup" + + if ns == "" { + return + } + + count, err := readMemoryEvent(filepath.Join(defaultCgroupMountpoint, ns), "oom_kill") + if err != nil { + bklog.G(ctx).WithError(err).Warn("failed to read oom_kill event") + return + } + if count > 0 { + gwErr.Err = syscall.ENOMEM + } +} + +func readMemoryEvent(fp string, event string) (uint64, error) { + f, err := os.Open(filepath.Join(fp, "memory.events")) + if err != nil { + return 0, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + parts := strings.Fields(s.Text()) + if len(parts) != 2 { + continue + } + if parts[0] != event { + continue + } + v, err := strconv.ParseUint(parts[1], 10, 64) + if err == nil { + return v, nil + } + } + return 0, s.Err() +}