diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 68b99d6075..fb0f95b940 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -3136,6 +3136,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	RST_MEM_FIXUP_PPTR(task_args->helpers);
 	RST_MEM_FIXUP_PPTR(task_args->zombies);
 	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
+	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
 
 	if (core->tc->has_seccomp_mode)
 		task_args->seccomp_mode = core->tc->seccomp_mode;
diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
index eded0d3515..b188522793 100644
--- a/criu/include/pagemap.h
+++ b/criu/include/pagemap.h
@@ -52,6 +52,9 @@ struct page_read {
 	int (*sync)(struct page_read *pr);
 	int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr);
 
+	/* Whether or not pages can be read in PIE code */
+	bool pieok;
+
 	/* Private data of reader */
 	struct cr_img *pmi;
 	struct cr_img *pi;
@@ -95,8 +98,11 @@ extern int open_page_read(int pid, struct page_read *, int pr_flags);
 extern int open_page_read_at(int dfd, int pid, struct page_read *pr,
 		int pr_flags);
 
+struct task_restore_args;
+
 int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
 			      unsigned long len, struct list_head *to);
+int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta);
 
 extern int dedup_one_iovec(struct page_read *pr, unsigned long base,
 			   unsigned long len);
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 81839f3987..2748544c94 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -101,6 +101,19 @@ struct thread_restore_args {
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
 
+/*
+ * One batch of page reads for the restorer: nr_iovs destination
+ * iovecs that are filled from the pages image starting at file
+ * offset off. The restorer steps from one batch to the next by
+ * RIO_SIZE(nr_iovs) bytes.
+ */
+struct restore_vma_io {
+	int nr_iovs;
+	loff_t off;
+	struct iovec iovs[];
+};
+
+#define RIO_SIZE(niovs)	(sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))
+
 struct task_restore_args {
 	struct thread_restore_args	*t;			/* thread group leader */
 
@@ -121,6 +134,10 @@ struct task_restore_args {
 	VmaEntry			*vmas;
 	unsigned int			vmas_n;
 
+	int				vma_ios_fd;
+	struct restore_vma_io		*vma_ios;
+	unsigned int			vma_ios_n;
+
 	struct restore_posix_timer	*posix_timers;
 	unsigned int			posix_timers_n;
 
diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
index 92dfc9d93b..c3dbe2dd3d 100644
--- a/criu/include/rst_info.h
+++ b/criu/include/rst_info.h
@@ -39,6 +39,8 @@ struct rst_info {
 
 	struct vm_area_list	vmas;
 	struct _MmEntry		*mm;
+	struct list_head	vma_io;
+	unsigned int		pages_img_id;
 
 	u32			cg_set;
 
diff --git a/criu/mem.c b/criu/mem.c
index 21ab40bf5f..96bff619db 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -710,8 +710,34 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void
 	return 0;
 }
 
+static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head)
+{
+	/*
+	 * Growsdown VMAs always have one guard page at the
+	 * beginning and sometimes this page contains data.
+	 * In case the VMA is premmaped, we premmap one page
+	 * larger VMA. In case of in place restore we can only
+	 * do this if the VMA in question is not "guarded" by
+	 * some other VMA.
+	 */
+	if (vma->e->flags & MAP_GROWSDOWN) {
+		if (vma->list.prev != head) {
+			struct vma_area *prev;
+
+			prev = list_entry(vma->list.prev, struct vma_area, list);
+			if (prev->e->end == vma->e->start) {
+				pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n",
+						vma->e->start, vma->e->end);
+				return true;
+			}
+		}
+	}
+
+	return false;
+}
+
 static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
-		void *at, struct page_read *pr)
+		void **at, struct page_read *pr)
 {
 	struct vma_area *vma;
 	unsigned long pstart = 0;
@@ -729,7 +755,15 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
 		if (!vma_area_is_private(vma, kdat.task_size))
 			continue;
 
-		ret = premap_private_vma(t, vma, &at);
+		if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h)) {
+			/*
+			 * VMA in question is not shared with anyone. We'll
+			 * restore it with its contents in restorer.
+			 */
+			continue;
+		}
+
+		ret = premap_private_vma(t, vma, at);
 		if (ret < 0)
 			break;
 	}
@@ -742,6 +776,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
 	struct vma_area *vma;
 	int ret = 0;
 	struct list_head *vmas = &rsti(t)->vmas.h;
+	struct list_head *vma_io = &rsti(t)->vma_io;
 
 	unsigned int nr_restored = 0;
 	unsigned int nr_shared = 0;
@@ -750,6 +785,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
 	unsigned long va;
 
 	vma = list_first_entry(vmas, struct vma_area, list);
+	rsti(t)->pages_img_id = pr->pages_img_id;
 
 	/*
 	 * Read page contents.
@@ -791,6 +827,32 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
 				goto err_addr;
 			}
 
+			/*
+			 * VMA was not premapped: queue the read for the restorer
+			 * instead of filling the pages here.
+			 */
+			if (!vma_area_is(vma, VMA_PREMMAPED)) {
+				unsigned long len = min_t(unsigned long,
+						(nr_pages - i) * PAGE_SIZE,
+						vma->e->end - va);
+
+				if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io))
+					return -1;
+
+				pr->skip_pages(pr, len);
+
+				va += len;
+				len >>= PAGE_SHIFT;
+				nr_restored += len;
+				i += len - 1;
+				pr_debug("Enqueue page-read\n");
+				continue;
+			}
+
+			/*
+			 * Otherwise do the COW restore
+			 */
+
 			off = (va - vma->e->start) / PAGE_SIZE;
 			p = decode_pointer((off) * PAGE_SIZE +
 					vma->premmaped_addr);
@@ -925,7 +987,7 @@ int prepare_mappings(struct pstree_item *t)
 
 	pr.advance(&pr); /* shift to the 1st iovec */
 
-	ret = premap_priv_vmas(t, vmas, addr, &pr);
+	ret = premap_priv_vmas(t, vmas, &addr, &pr);
 	if (ret < 0)
 		goto out;
 
@@ -942,6 +1004,25 @@ int prepare_mappings(struct pstree_item *t)
 				old_premmapped_addr, old_premmapped_len);
 	}
 
+	/*
+	 * Not all VMAs were premmaped. Find out the unused tail of the
+	 * premapped area and unmap it.
+	 */
+	old_premmapped_len = addr - rsti(t)->premmapped_addr;
+	if (old_premmapped_len < rsti(t)->premmapped_len) {
+		unsigned long tail;
+
+		tail = rsti(t)->premmapped_len - old_premmapped_len;
+		ret = munmap(addr, tail);
+		if (ret < 0) {
+			pr_perror("Unable to unmap %p(%lx)", addr, tail);
+			goto out;
+		}
+		rsti(t)->premmapped_len = old_premmapped_len;
+		pr_info("Shrunk premap area to %p(%lx)\n",
+				rsti(t)->premmapped_addr, rsti(t)->premmapped_len);
+	}
+
 out:
 	return ret;
 }
@@ -995,6 +1076,25 @@ int open_vmas(struct pstree_item *t)
 	return 0;
 }
 
+/*
+ * Open the pages image recorded for the task, pass its raw descriptor
+ * to the restorer and render the queued page reads into restorer args.
+ *
+ * NOTE(review): the struct cr_img returned by open_image() is neither
+ * stored nor closed here -- confirm the wrapper is not leaked.
+ */
+static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
+{
+	struct cr_img *pages;
+
+	pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
+	if (!pages)
+		return -1;
+
+	ta->vma_ios_fd = img_raw_fd(pages);
+	return pagemap_render_iovec(&rsti(t)->vma_io, ta);
+}
+
 int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
 {
 	struct vma_area *vma;
@@ -1020,6 +1120,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
 			vma_premmaped_start(vme) = vma->premmaped_addr;
 	}
 
-	return 0;
+	return prepare_vma_ios(t, ta);
 }
 
diff --git a/criu/pagemap.c b/criu/pagemap.c
index 512beed8ba..2e163ce5ff 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -10,7 +10,8 @@
 #include "cr_options.h"
 #include "servicefd.h"
 #include "pagemap.h"
-
+#include "restorer.h"
+#include "rst-malloc.h"
 #include "fault-injection.h"
 #include "xmalloc.h"
 #include "protobuf.h"
@@ -309,6 +310,38 @@ static int enqueue_async_iov(struct page_read *pr, void *buf,
 	return 0;
 }
 
+/*
+ * Copy every queued page_read_iov from @from into RM_PRIVATE restorer
+ * memory as a struct restore_vma_io, and record the start of the
+ * rendered area and the number of entries in @ta.
+ * Returns 0 on success, -1 if rst_mem_alloc() fails.
+ */
+int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta)
+{
+	struct page_read_iov *piov;
+
+	ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE);
+	ta->vma_ios_n = 0;
+
+	list_for_each_entry(piov, from, l) {
+		struct restore_vma_io *rio;
+
+		pr_info("`- render %d iovs (%p:%zu...)\n", piov->nr,
+				piov->to[0].iov_base, piov->to[0].iov_len);
+		rio = rst_mem_alloc(RIO_SIZE(piov->nr), RM_PRIVATE);
+		if (!rio)
+			return -1;
+
+		rio->nr_iovs = piov->nr;
+		rio->off = piov->from;
+		memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec));
+
+		ta->vma_ios_n++;
+	}
+
+	return 0;
+}
+
 int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
 			      unsigned long len, struct list_head *to)
 {
@@ -641,6 +674,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 	pr->bunch.iov_len = 0;
 	pr->bunch.iov_base = NULL;
 	pr->pmes = NULL;
+	pr->pieok = false;
 
 	pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
 	if (!pr->pmi)
@@ -673,6 +707,8 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 	pr->sync = process_async_reads;
 	pr->seek_pagemap = seek_pagemap;
 	pr->id = ids++;
+	if (!pr->parent)
+		pr->pieok = true;
 
 	pr_debug("Opened page read %u (parent %u)\n",
 			pr->id, pr->parent ? pr->parent->id : 0);
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index b219d9022c..db13d64ccb 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -595,6 +595,10 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
 	if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
 		flags &= ~MAP_ANONYMOUS;
 
+	/* See comment in premap_private_vma() for this flag change */
+	if (vma_entry_is(vma_entry, VMA_AREA_AIORING))
+		flags |= MAP_ANONYMOUS;
+
 	/* A mapping of file with MAP_SHARED is up to date */
 	if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
 		prot |= PROT_WRITE;
@@ -1082,7 +1086,7 @@ long __export_restore_task(struct task_restore_args *args)
 	int i;
 	VmaEntry *vma_entry;
 	unsigned long va;
-
+	struct restore_vma_io *rio;
 	struct rt_sigframe *rt_sigframe;
 	struct prctl_mm_map prctl_map;
 	unsigned long new_sp;
@@ -1179,7 +1183,8 @@ long __export_restore_task(struct task_restore_args *args)
 	for (i = 0; i < args->vmas_n; i++) {
 		vma_entry = args->vmas + i;
 
-		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
+		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) &&
+				!vma_entry_is(vma_entry, VMA_AREA_AIORING))
 			continue;
 
 		if (vma_entry_is(vma_entry, VMA_PREMMAPED))
@@ -1193,6 +1198,55 @@ long __export_restore_task(struct task_restore_args *args)
 		}
 	}
 
+	/*
+	 * Now read the contents (if any). Each restore_vma_io is one
+	 * preadv() batch; a short read is resumed from where it stopped.
+	 */
+
+	rio = args->vma_ios;
+	for (i = 0; i < args->vma_ios_n; i++) {
+		struct iovec *iovs = rio->iovs;
+		int nr = rio->nr_iovs;
+		ssize_t r;
+
+		while (nr) {
+			pr_debug("Preadv %lx:%d... (%d iovs)\n",
+					(unsigned long)iovs->iov_base,
+					(int)iovs->iov_len, nr);
+			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
+			if (r < 0) {
+				pr_err("Can't read pages data (%d)\n", (int)r);
+				goto core_restore_end;
+			}
+			if (r == 0 && iovs->iov_len != 0) {
+				/* No progress: retrying would spin forever */
+				pr_err("Unexpected EOF in pages data\n");
+				goto core_restore_end;
+			}
+
+			pr_debug("`- returned %ld\n", (long)r);
+			rio->off += r;
+			/* Advance the iovecs */
+			do {
+				if (iovs->iov_len <= r) {
+					pr_debug(" `- skip pagemap\n");
+					r -= iovs->iov_len;
+					iovs++;
+					nr--;
+					continue;
+				}
+
+				iovs->iov_base += r;
+				iovs->iov_len -= r;
+				break;
+			} while (nr > 0);
+		}
+
+		rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs);
+	}
+
+	sys_close(args->vma_ios_fd);
+
 #ifdef CONFIG_VDSO
 	/*
 	 * Proxify vDSO.
diff --git a/criu/pstree.c b/criu/pstree.c
index b1006e7251..9b2d171dcb 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -209,6 +209,7 @@ struct pstree_item *__alloc_pstree_item(bool rst)
 
 		memset(item, 0, sz);
 		vm_area_list_init(&rsti(item)->vmas);
+		INIT_LIST_HEAD(&rsti(item)->vma_io);
 		item->pid = (void *)item + sizeof(*item) + sizeof(struct rst_info);
 	}
 