From 868e9fa9f520dcf1142491f82b858b93fc451644 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:34:17 -0800 Subject: [PATCH] freeze_processes: implement kludges for cgroup v1 Cgroup v1 freezer has always been problematic, failing to freeze a cgroup. In runc, we have implemented a few kludges to increase the chance of succeeding, but those are used when runc freezes a cgroup for its own purposes (for "runc pause" and to modify device properties for cgroup v1). When criu is used, it fails to freeze a cgroup from time to time (see [1], [2]). Let's try adding kludges similar to ones in runc. Alas, I have absolutely no way to test this, so please review carefully. [1]: https://github.com/opencontainers/runc/issues/4273 [2]: https://github.com/opencontainers/runc/issues/4457 Signed-off-by: Kir Kolyshkin --- criu/seize.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/criu/seize.c b/criu/seize.c index 6701446aec..829d7c2783 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -539,6 +539,34 @@ static int prepare_freezer_for_interrupt_only_mode(void) return exit_code; } +static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) { + /* As per older kernel docs (freezer-subsystem.txt before + * the kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + * userspace should either retry or thaw. While current + * kernel cgroup v1 docs no longer mention a need to retry, + * even recent kernels can't reliably freeze a cgroup v1. + * + * Let's keep asking the kernel to freeze from time to time. + * In addition, do occasional thaw/sleep/freeze. + * + * This is still a game of chances (the real fix belongs to the kernel) + * but these kludges might improve the probability of success. + * + * Cgroup v2 does not have this problem. + */ + switch (iter % 32) { + case 9: + case 20: + freezer_write_state(fd, FROZEN); + break; + case 31: + freezer_write_state(fd, THAWED); + nanosleep(req, NULL); + freezer_write_state(fd, FROZEN); + break; + } +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -597,6 +625,9 @@ static int freeze_processes(void) if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; + if (!cgroup_v2) + cgroupv1_freezer_kludges(fd, i, &req); + nanosleep(&req, NULL); }