Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Freeze fixes and v1 kludges #2545

Merged
merged 3 commits into from
Jan 7, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 47 additions & 14 deletions criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,10 @@
static const char freezing[] = "FREEZING";
static const char thawed[] = "THAWED";

enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING };
enum freezer_state { FREEZER_ERROR = -1,
THAWED,
FROZEN,
FREEZING };

/* Track if we are running on cgroup v2 system. */
static bool cgroup_v2 = false;
Expand Down Expand Up @@ -536,13 +539,42 @@
return exit_code;
}

static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) {

Check warning on line 542 in criu/seize.c

View workflow job for this annotation

GitHub Actions / build

/* As per older kernel docs (freezer-subsystem.txt before
* the kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
* userspace should either retry or thaw. While current
* kernel cgroup v1 docs no longer mention a need to retry,
* even recent kernels can't reliably freeze a cgroup v1.
*
* Let's keep asking the kernel to freeze from time to time.
* In addition, do occasional thaw/sleep/freeze.
*
* This is still a game of chances (the real fix belongs to the kernel)
* but these kludges might improve the probability of success.
*
* Cgroup v2 does not have this problem.
*/
switch (iter % 32) {
case 9:

Check warning on line 558 in criu/seize.c

View workflow job for this annotation

GitHub Actions / build

case 20:
freezer_write_state(fd, FROZEN);
break;
case 31:
freezer_write_state(fd, THAWED);
nanosleep(req, NULL);
freezer_write_state(fd, FROZEN);
break;
}
}

static int freeze_processes(void)
{
int fd, exit_code = -1;
enum freezer_state state = THAWED;

static const unsigned long step_ms = 100;
unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms;
/* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. */
unsigned long nr_attempts = (opts.timeout * 1000) / step_ms;
rst0git marked this conversation as resolved.
Show resolved Hide resolved
unsigned long i = 0;

const struct timespec req = {
Expand All @@ -551,14 +583,12 @@
};

if (unlikely(!nr_attempts)) {
/*
* If timeout is turned off, lets
* wait for at least 10 seconds.
*/
nr_attempts = (10 * 1000000) / step_ms;
/* If the timeout is 0, wait for at least 10 seconds. */
nr_attempts = (10 * 1000) / step_ms;
}

pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms);
pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n",
opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout);

fd = freezer_open();
if (fd < 0)
Expand All @@ -585,22 +615,25 @@
* not read @tasks pids while freezer in
* transition stage.
*/
for (; i <= nr_attempts; i++) {
while (1) {
state = get_freezer_state(fd);
if (state == FREEZER_ERROR) {
close(fd);
return -1;
}

if (state == FROZEN)
if (state == FROZEN || i++ == nr_attempts || alarm_timeouted())
break;
if (alarm_timeouted())
goto err;

if (!cgroup_v2)
cgroupv1_freezer_kludges(fd, i, &req);

nanosleep(&req, NULL);
}

if (i > nr_attempts) {
pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
if (state != FROZEN) {
pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n",
opts.freeze_cgroup, i, step_ms, opts.timeout);
if (!pr_quelled(LOG_DEBUG))
log_unfrozen_stacks(opts.freeze_cgroup);
goto err;
Expand Down
Loading