Skip to content

Commit

Permalink
Set container oom killed state correctly
Browse files Browse the repository at this point in the history
Parse the oom file created by conmon when a container is
oomkilled and set the oomkilled value to true in the container
state. conmon writes the oom file in a persist directory that
podman checks now for any oom files for a given container.

Signed-off-by: Urvashi Mohnani <umohnani@redhat.com>
  • Loading branch information
umohnani8 committed May 17, 2023
1 parent a120184 commit ac7fa3d
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 11 deletions.
5 changes: 5 additions & 0 deletions libpod/container_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,11 @@ func (c *Container) execExitFileDir(sessionID string) string {
return filepath.Join(c.execBundlePath(sessionID), "exit")
}

// execOOMFileDir gets the path to the container's persist directory
func (c *Container) execOOMFileDir(sessionID string) string {
return filepath.Join(c.execBundlePath(sessionID), "persists", c.ID())
}

// execOCILog returns the file path for the exec sessions oci log
func (c *Container) execOCILog(sessionID string) string {
if !c.ociRuntime.SupportsJSONErrors() {
Expand Down
25 changes: 19 additions & 6 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"strconv"
Expand Down Expand Up @@ -142,6 +143,10 @@ func (c *Container) exitFilePath() (string, error) {
return c.ociRuntime.ExitFilePath(c)
}

func (c *Container) oomFilePath() (string, error) {
return c.ociRuntime.OOMFilePath(c)
}

// Wait for the container's exit file to appear.
// When it does, update our state based on it.
func (c *Container) waitForExitFileAndSync() error {
Expand Down Expand Up @@ -191,7 +196,10 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
}
c.state.ExitCode = int32(statusCode)

oomFilePath := filepath.Join(c.bundlePath(), "oom")
oomFilePath, err := c.oomFilePath()
if err != nil {
return err
}
if _, err = os.Stat(oomFilePath); err == nil {
c.state.OOMKilled = true
}
Expand Down Expand Up @@ -725,11 +733,6 @@ func (c *Container) removeConmonFiles() error {
return fmt.Errorf("removing container %s winsz file: %w", c.ID(), err)
}

oomFile := filepath.Join(c.bundlePath(), "oom")
if err := os.Remove(oomFile); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("removing container %s OOM file: %w", c.ID(), err)
}

// Remove the exit file so we don't leak memory in tmpfs
exitFile, err := c.exitFilePath()
if err != nil {
Expand All @@ -739,6 +742,16 @@ func (c *Container) removeConmonFiles() error {
return fmt.Errorf("removing container %s exit file: %w", c.ID(), err)
}

// Remove the persist dir for the container
oomFile, err := c.oomFilePath()
if err != nil {
return err
}
persistDir, _ := filepath.Split(oomFile)
if err := os.RemoveAll(persistDir); err != nil && !errors.Is(err, fs.ErrNotExist) {
return fmt.Errorf("removing container %s persist directory: %w", c.ID(), err)
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions libpod/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ type OCIRuntime interface { //nolint:interfacebloat
// This is the path to that file for a given container.
ExitFilePath(ctr *Container) (string, error)

OOMFilePath(ctr *Container) (string, error)

// RuntimeInfo returns verbose information about the runtime.
RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error)

Expand Down
41 changes: 38 additions & 3 deletions libpod/oci_conmon_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type ConmonOCIRuntime struct {
supportsKVM bool
supportsNoCgroups bool
enableKeyring bool
persistDir string
}

// Make a new Conmon-based OCI runtime with the given options.
Expand Down Expand Up @@ -143,7 +144,12 @@ func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtime
return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg)
}

// exitDir can be replaced with persistDir completely as the exit file is written there as well
// but this would be a breaking change, so something to do in podman 5.0
// TODO: podman v5.0
runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits")
// The persist-dir is where conmon writes the exit file and oom file if the container was oom killed, we join the container ID to this path later on
runtime.persistDir = filepath.Join(runtime.tmpDir, "persists")

// Create the exit files and attach sockets directories
if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil {
Expand Down Expand Up @@ -897,6 +903,16 @@ func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) {
return filepath.Join(r.exitsDir, ctr.ID()), nil
}

func (r *ConmonOCIRuntime) OOMFilePath(ctr *Container) (string, error) {
if ctr == nil {
return "", fmt.Errorf("must provide a valid container to get oom file path: %w", define.ErrInvalidArg)
}
if !strings.Contains(r.persistDir, ctr.ID()) {
r.persistDir = filepath.Join(r.persistDir, ctr.ID())
}
return filepath.Join(r.persistDir, "oom"), nil
}

// RuntimeInfo provides information on the runtime.
func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) {
runtimePackage := packageVersion(r.path)
Expand Down Expand Up @@ -1035,7 +1051,10 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
pidfile = filepath.Join(ctr.state.RunDir, "pidfile")
}

args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag)
args, err := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, r.persistDir, ociLog, ctr.LogDriver(), logTag)
if err != nil {
return 0, err
}

if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" {
args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket))
Expand Down Expand Up @@ -1289,7 +1308,22 @@ func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string {
}

// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI
func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string {
func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, persistDir, ociLogPath, logDriver, logTag string) ([]string, error) {
if !strings.Contains(persistDir, ctr.ID()) {
persistDir = filepath.Join(persistDir, ctr.ID())
}
// Make the persists directory for the container after appending the ctr ID to it
// This is needed as conmon writes the exit and oom file in the given persist directory path as just "exit" and "oom"
// So creating a directory with the container ID under the persist dir will help keep track of which container the
// exit and oom files belong to.
// TODO: v5.0 use the exit file from the persist dir only and discard the use of exit-dir
if err := os.MkdirAll(persistDir, 0750); err != nil {
// The directory is allowed to exist
if !os.IsExist(err) {
return nil, fmt.Errorf("creating OCI runtime oom files directory for ctr %q: %w", ctr.ID(), err)
}
}

// set the conmon API version to be able to use the correct sync struct keys
args := []string{
"--api-version", "1",
Expand All @@ -1300,6 +1334,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
"-p", pidPath,
"-n", ctr.Name(),
"--exit-dir", exitDir,
"--persist-dir", persistDir,
"--full-attach",
}
if len(r.runtimeFlags) > 0 {
Expand Down Expand Up @@ -1364,7 +1399,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
logrus.Debugf("Running with no Cgroups")
args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled")
}
return args
return args, nil
}

func startCommand(cmd *exec.Cmd, ctr *Container) error {
Expand Down
5 changes: 4 additions & 1 deletion libpod/oci_conmon_exec_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,10 @@ func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *Ex
}
defer processFile.Close()

args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog, define.NoLogging, c.config.LogTag)
args, err := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), c.execOOMFileDir(sessionID), ociLog, define.NoLogging, c.config.LogTag)
if err != nil {
return nil, nil, err
}

if options.PreserveFDs > 0 {
args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...)
Expand Down
15 changes: 14 additions & 1 deletion libpod/oci_missing.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"net/http"
"path/filepath"
"strings"
"sync"

"github.com/containers/common/pkg/resize"
Expand All @@ -26,7 +27,8 @@ type MissingRuntime struct {
// Name is the name of the missing runtime. Will be used in errors.
name string
// exitsDir is the directory for exit files.
exitsDir string
exitsDir string
persistDir string
}

// Get a new MissingRuntime for the given name.
Expand All @@ -50,6 +52,7 @@ func getMissingRuntime(name string, r *Runtime) OCIRuntime {
newRuntime := new(MissingRuntime)
newRuntime.name = name
newRuntime.exitsDir = filepath.Join(r.config.Engine.TmpDir, "exits")
newRuntime.persistDir = filepath.Join(r.config.Engine.TmpDir, "persists")

missingRuntimes[name] = newRuntime

Expand Down Expand Up @@ -220,6 +223,16 @@ func (r *MissingRuntime) ExitFilePath(ctr *Container) (string, error) {
return filepath.Join(r.exitsDir, ctr.ID()), nil
}

func (r *MissingRuntime) OOMFilePath(ctr *Container) (string, error) {
if ctr == nil {
return "", fmt.Errorf("must provide a valid container to get oom file path: %w", define.ErrInvalidArg)
}
if !strings.Contains(r.persistDir, ctr.ID()) {
r.persistDir = filepath.Join(r.persistDir, ctr.ID())
}
return filepath.Join(r.persistDir, "oom"), nil
}

// RuntimeInfo returns information on the missing runtime
func (r *MissingRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) {
ocirt := define.OCIRuntimeInfo{
Expand Down
36 changes: 36 additions & 0 deletions test/e2e/run_memory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package integration
import (
"fmt"

"github.com/containers/podman/v4/test/utils"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
. "github.com/onsi/gomega/gexec"
Expand Down Expand Up @@ -74,4 +75,39 @@ var _ = Describe("Podman run memory", func() {
Expect(session.OutputToString()).To(Equal(limit))
})
}

It("podman run memory test on oomkilled container", func() {
mem := utils.SystemExec("cat", []string{"/proc/sys/vm/overcommit_memory"})
mem.WaitWithDefaultTimeout()
if mem.OutputToString() != "0" {
Skip("overcommit memory is not set to 0")
}

ctrName := "oomkilled-ctr"
// create a container that gets oomkilled
session := podmanTest.Podman([]string{"run", "--name", ctrName, "--read-only", "--memory-swap=20m", "--memory=20m", "--oom-score-adj=1000", ALPINE, "sort", "/dev/urandom"})
session.WaitWithDefaultTimeout()
Expect(session).Should(Exit(137))

inspect := podmanTest.Podman(([]string{"inspect", "--format", "{{.State.OOMKilled}} {{.State.ExitCode}}", ctrName}))
inspect.WaitWithDefaultTimeout()
Expect(inspect).Should(Exit(0))
// Check oomkilled and exit code values
Expect(inspect.OutputToString()).Should(ContainSubstring("true"))
Expect(inspect.OutputToString()).Should(ContainSubstring("137"))
})

It("podman run memory test on successfully exited container", func() {
ctrName := "success-ctr"
session := podmanTest.Podman([]string{"run", "--name", ctrName, "--memory=40m", ALPINE, "echo", "hello"})
session.WaitWithDefaultTimeout()
Expect(session).Should(Exit(0))

inspect := podmanTest.Podman(([]string{"inspect", "--format", "{{.State.OOMKilled}} {{.State.ExitCode}}", ctrName}))
inspect.WaitWithDefaultTimeout()
Expect(inspect).Should(Exit(0))
// Check oomkilled and exit code values
Expect(inspect.OutputToString()).Should(ContainSubstring("false"))
Expect(inspect.OutputToString()).Should(ContainSubstring("0"))
})
})

0 comments on commit ac7fa3d

Please sign in to comment.