From 9983e87440760883b2a8ecd83d93f1b0dd78d44a Mon Sep 17 00:00:00 2001
From: Matt Heon
Date: Tue, 6 Feb 2024 16:08:39 -0500
Subject: [PATCH 1/2] Remove leftover autoremove containers during refresh

During system shutdown, Podman should go down gracefully, meaning that
we have time to spawn cleanup processes which remove any containers set
to autoremove. Unfortunately, this isn't always the case. If we get a
SIGKILL because the system is going down immediately, we can't recover
from this, and the autoremove containers are not removed.

However, we can pick up any leftover autoremove containers when we
refresh the DB state, which is the first thing Podman does after a
reboot. By detecting, at that point, any autoremove containers that
actually ran (a container that was created but never run doesn't need
to be removed) and removing them, we keep the fresh boot clean, even if
Podman was terminated abnormally.

Fixes #21482

[NO NEW TESTS NEEDED] This requires a reboot to realistically test.

Signed-off-by: Matt Heon
---
 libpod/runtime.go | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/libpod/runtime.go b/libpod/runtime.go
index 3195742e70..094d6df55d 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -166,7 +166,7 @@ func NewRuntime(ctx context.Context, options ...RuntimeOption) (*Runtime, error)
 	if err != nil {
 		return nil, err
 	}
-	return newRuntimeFromConfig(conf, options...)
+	return newRuntimeFromConfig(ctx, conf, options...)
 }
 
 // NewRuntimeFromConfig creates a new container runtime using the given
@@ -175,10 +175,10 @@ func NewRuntime(ctx context.Context, options ...RuntimeOption) (*Runtime, error)
 // An error will be returned if the configuration file at the given path does
 // not exist or cannot be loaded
 func NewRuntimeFromConfig(ctx context.Context, userConfig *config.Config, options ...RuntimeOption) (*Runtime, error) {
-	return newRuntimeFromConfig(userConfig, options...)
+	return newRuntimeFromConfig(ctx, userConfig, options...)
 }
 
-func newRuntimeFromConfig(conf *config.Config, options ...RuntimeOption) (*Runtime, error) {
+func newRuntimeFromConfig(ctx context.Context, conf *config.Config, options ...RuntimeOption) (*Runtime, error) {
 	runtime := new(Runtime)
 
 	if conf.Engine.OCIRuntime == "" {
@@ -223,7 +223,7 @@ func newRuntimeFromConfig(conf *config.Config, options ...RuntimeOption) (*Runti
 		return nil, fmt.Errorf("starting shutdown signal handler: %w", err)
 	}
 
-	if err := makeRuntime(runtime); err != nil {
+	if err := makeRuntime(ctx, runtime); err != nil {
 		return nil, err
 	}
 
@@ -333,7 +333,7 @@ func getDBState(runtime *Runtime) (State, error) {
 
 // Make a new runtime based on the given configuration
 // Sets up containers/storage, state store, OCI runtime
-func makeRuntime(runtime *Runtime) (retErr error) {
+func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {
 	// Find a working conmon binary
 	cPath, err := runtime.config.FindConmon()
 	if err != nil {
@@ -629,6 +629,13 @@ func makeRuntime(runtime *Runtime) (retErr error) {
 		return err
 	}
 
+	// Mark the runtime as valid - ready to be used, cannot be modified
+	// further.
+	// Need to do this *before* refresh as we can remove containers there.
+	// Should not be a big deal as we don't return it to users until after
+	// refresh runs.
+	runtime.valid = true
+
 	// If we need to refresh the state, do it now - things are guaranteed to
 	// be set up by now.
 	if doRefresh {
@@ -639,17 +646,13 @@ func makeRuntime(runtime *Runtime) (retErr error) {
 			}
 		}
 
-		if err2 := runtime.refresh(runtimeAliveFile); err2 != nil {
+		if err2 := runtime.refresh(ctx, runtimeAliveFile); err2 != nil {
 			return err2
 		}
 	}
 
 	runtime.startWorker()
 
-	// Mark the runtime as valid - ready to be used, cannot be modified
-	// further
-	runtime.valid = true
-
 	return nil
 }
 
@@ -819,7 +822,7 @@ func (r *Runtime) Shutdown(force bool) error {
 // Reconfigures the runtime after a reboot
 // Refreshes the state, recreating temporary files
 // Does not check validity as the runtime is not valid until after this has run
-func (r *Runtime) refresh(alivePath string) error {
+func (r *Runtime) refresh(ctx context.Context, alivePath string) error {
 	logrus.Debugf("Podman detected system restart - performing state refresh")
 
 	// Clear state of database if not running in container
@@ -856,6 +859,22 @@ func (r *Runtime) refresh(alivePath string) error {
 		if err := ctr.refresh(); err != nil {
 			logrus.Errorf("Refreshing container %s: %v", ctr.ID(), err)
 		}
+		// This is the only place it's safe to use ctr.state.State unlocked
+		// We're holding the alive lock, guaranteed to be the only Libpod on the system right now.
+		if ctr.AutoRemove() && ctr.state.State == define.ContainerStateExited {
+			opts := ctrRmOpts{
+				// Don't force-remove, we're supposed to be fresh off a reboot
+				// If we have to force something is seriously wrong
+				Force:        false,
+				RemoveVolume: true,
+			}
+			// This container should have autoremoved before the
+			// reboot but did not.
+			// Get rid of it.
+			if _, _, err := r.removeContainer(ctx, ctr, opts); err != nil {
+				logrus.Errorf("Unable to remove container %s which should have autoremoved: %v", ctr.ID(), err)
+			}
+		}
 	}
 	for _, pod := range pods {
 		if err := pod.refresh(); err != nil {
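
Before the second patch, the condition added above is the whole refresh-time cleanup decision. The following is a minimal, self-contained sketch of that decision in Go; the names used (ContainerState, Container, shouldRemoveOnRefresh) are invented stand-ins for illustration, not libpod's actual types or API.

// refresh_cleanup_sketch.go - a simplified illustration of the cleanup
// decision the first patch adds to Runtime.refresh(). All identifiers
// here are stand-ins, not libpod's real ones.
package main

import "fmt"

// ContainerState models only the states that matter for this decision.
type ContainerState int

const (
	StateConfigured ContainerState = iota
	StateCreated
	StateExited
)

// Container is a stand-in carrying just the two fields the check reads.
type Container struct {
	ID         string
	AutoRemove bool
	State      ContainerState
}

// shouldRemoveOnRefresh reports whether a container left over from the
// previous boot should be removed during the post-reboot refresh: only
// autoremove containers that actually ran (and therefore reached
// Exited). A container that was created but never run is kept.
func shouldRemoveOnRefresh(c *Container) bool {
	return c.AutoRemove && c.State == StateExited
}

func main() {
	ctrs := []*Container{
		{ID: "ran-and-exited", AutoRemove: true, State: StateExited},
		{ID: "created-never-ran", AutoRemove: true, State: StateCreated},
		{ID: "no-autoremove", AutoRemove: false, State: StateExited},
	}
	for _, c := range ctrs {
		fmt.Printf("%-18s remove=%v\n", c.ID, shouldRemoveOnRefresh(c))
	}
}

Note that the patch deliberately removes without force: right after a reboot nothing should still be holding the container, so needing force would indicate a deeper problem.
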
From 3cf2f8ccf4233c6d9707b45a7315af73718be78e Mon Sep 17 00:00:00 2001
From: Matt Heon
Date: Wed, 7 Feb 2024 08:33:56 -0500
Subject: [PATCH 2/2] Handle more states during refresh

We were preserving ContainerStateExited, which is better than nothing,
but definitely not correct. A container that ran at any point during
the last boot should be moved to the Exited state to preserve the fact
that it was run at least once. This means we have to convert Running,
Stopped, Stopping, and Paused containers to Exited as well.

Signed-off-by: Matt Heon
---
 libpod/container_internal.go | 14 +++++++++++++-
 libpod/runtime.go            |  2 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 361a342f3f..b33cedca43 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -622,7 +622,19 @@ func resetContainerState(state *ContainerState) {
 	state.ConmonPID = 0
 	state.Mountpoint = ""
 	state.Mounted = false
-	if state.State != define.ContainerStateExited {
+	// Reset state.
+	// Almost all states are reset to either Configured or Exited,
+	// except ContainerStateRemoving which is preserved.
+	switch state.State {
+	case define.ContainerStateStopped, define.ContainerStateExited, define.ContainerStateStopping, define.ContainerStateRunning, define.ContainerStatePaused:
+		// All containers that ran at any point during the last boot
+		// must be placed in the Exited state.
+		state.State = define.ContainerStateExited
+	case define.ContainerStateConfigured, define.ContainerStateCreated:
+		state.State = define.ContainerStateConfigured
+	case define.ContainerStateUnknown:
+		// Something really strange must have happened to get us here.
+		// Reset to configured, maybe the reboot cleared things up?
 		state.State = define.ContainerStateConfigured
 	}
 	state.ExecSessions = make(map[string]*ExecSession)
diff --git a/libpod/runtime.go b/libpod/runtime.go
index 094d6df55d..42009c0d8f 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -861,7 +861,7 @@ func (r *Runtime) refresh(ctx context.Context, alivePath string) error {
 		}
 		// This is the only place it's safe to use ctr.state.State unlocked
 		// We're holding the alive lock, guaranteed to be the only Libpod on the system right now.
-		if ctr.AutoRemove() && ctr.state.State == define.ContainerStateExited {
+		if (ctr.AutoRemove() && ctr.state.State == define.ContainerStateExited) || ctr.state.State == define.ContainerStateRemoving {
 			opts := ctrRmOpts{
 				// Don't force-remove, we're supposed to be fresh off a reboot
 				// If we have to force something is seriously wrong
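
Taken together, the two patches define a simple post-reboot mapping of container states plus a removal rule. Below is a minimal sketch of that mapping in Go; the identifiers (resetStateAfterReboot and the State constants) are simplified stand-ins for illustration, not the real libpod names.

// state_reset_sketch.go - a simplified illustration of the state
// mapping the second patch introduces in resetContainerState().
package main

import "fmt"

type ContainerState string

const (
	StateUnknown    ContainerState = "unknown"
	StateConfigured ContainerState = "configured"
	StateCreated    ContainerState = "created"
	StateRunning    ContainerState = "running"
	StatePaused     ContainerState = "paused"
	StateStopping   ContainerState = "stopping"
	StateStopped    ContainerState = "stopped"
	StateExited     ContainerState = "exited"
	StateRemoving   ContainerState = "removing"
)

// resetStateAfterReboot maps a container's pre-reboot state to its
// state after refresh: anything that ran becomes Exited, anything that
// never ran becomes Configured, and Removing is preserved so the
// refresh loop can finish the interrupted removal.
func resetStateAfterReboot(s ContainerState) ContainerState {
	switch s {
	case StateRunning, StatePaused, StateStopping, StateStopped, StateExited:
		return StateExited
	case StateConfigured, StateCreated, StateUnknown:
		return StateConfigured
	case StateRemoving:
		return StateRemoving
	default:
		return StateConfigured
	}
}

func main() {
	for _, s := range []ContainerState{StateRunning, StateCreated, StateRemoving, StateUnknown} {
		fmt.Printf("%-10s -> %s\n", s, resetStateAfterReboot(s))
	}
}

Preserving Removing matters because the second hunk widens the refresh-time removal check to also pick up containers whose removal was interrupted by the reboot, not just autoremove containers that reached Exited.
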