Skip to content

Commit

Permalink
seize: use separate checkpoint_devices function
Browse files Browse the repository at this point in the history
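Move `run_plugins(CHECKPOINT_DEVICES)` out of `collect_pstree()` and into
a new `checkpoint_devices()` function, so that the sole responsibility of
`collect_pstree()` is to use the cgroup freezer for the process tree.
Because `collect_pstree()` disables the freeze timer before it returns,
the device plugins now run without that timer armed. This allows us to
avoid a timeout error when checkpointing applications with large GPU state.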

Suggested-by: Andrei Vagin <avagin@google.com>
Suggested-by: Jesus Ramos <jeramos@nvidia.com>
Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
  • Loading branch information
rst0git committed Jan 15, 2025
1 parent 9f6e4e6 commit 14ae222
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 7 deletions.
3 changes: 3 additions & 0 deletions criu/cr-dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -2192,6 +2192,9 @@ int cr_dump_tasks(pid_t pid)
if (collect_pstree())
goto err;

if (checkpoint_devices())
goto err;

if (collect_pstree_ids())
goto err;

Expand Down
1 change: 1 addition & 0 deletions criu/include/seize.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define __CR_SEIZE_H__

extern int collect_pstree(void);
extern int checkpoint_devices(void);
struct pstree_item;
extern void pstree_switch_state(struct pstree_item *root_item, int st);
extern const char *get_real_freezer_state(void);
Expand Down
23 changes: 16 additions & 7 deletions criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,6 @@ int collect_pstree(void)
pid_t pid = root_item->pid->real;
int ret, exit_code = -1;
struct proc_status_creds creds;
struct pstree_item *iter;

timing_start(TIME_FREEZING);

Expand Down Expand Up @@ -1111,6 +1110,21 @@ int collect_pstree(void)
goto err;
}

exit_code = 0;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);

err:
/* Freezing stage finished in time - disable timer. */
alarm(0);
return exit_code;
}

int checkpoint_devices(void)
{
struct pstree_item *iter;
int ret, exit_code = -1;

for_each_pstree_item(iter) {
if (!task_alive(iter))
continue;
Expand All @@ -1120,11 +1134,6 @@ int collect_pstree(void)
}

exit_code = 0;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);

err:
/* Freezing stage finished in time - disable timer. */
alarm(0);
return exit_code;
}
}

0 comments on commit 14ae222

Please sign in to comment.